arm_compute v20.05
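This diff covers src/runtime/CL/functions and applies one pattern throughout: every CL runtime function gains a configure() overload taking a const CLCompileContext &, the legacy overload becomes a thin forwarder that fetches the default context from CLKernelLibrary::get().get_compile_context(), and the make_unique helper now comes from "support/MemorySupport.h" instead of "support/ToolchainSupport.h". A minimal sketch of the pattern, using the hypothetical names CLExampleFunction/CLExampleKernel (these do not exist in the library):

    #include "arm_compute/core/CL/CLKernelLibrary.h"
    #include "support/MemorySupport.h" // arm_compute::support::cpp14::make_unique

    void CLExampleFunction::configure(const ICLTensor *input, ICLTensor *output)
    {
        // Legacy entry point: forward to the context-aware overload using the
        // default compile context owned by the CLKernelLibrary singleton.
        configure(CLKernelLibrary::get().get_compile_context(), input, output);
    }

    void CLExampleFunction::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
    {
        // Kernel compilation now happens against the caller-supplied context.
        auto k = arm_compute::support::cpp14::make_unique<CLExampleKernel>();
        k->configure(compile_context, input, output);
        _kernel = std::move(k);
    }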
diff --git a/src/runtime/CL/functions/CLAbsoluteDifference.cpp b/src/runtime/CL/functions/CLAbsoluteDifference.cpp
index 5613e6c..492c54e 100644
--- a/src/runtime/CL/functions/CLAbsoluteDifference.cpp
+++ b/src/runtime/CL/functions/CLAbsoluteDifference.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLAbsoluteDifference.h"
#include "arm_compute/core/CL/kernels/CLAbsoluteDifferenceKernel.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
#include <utility>
@@ -32,7 +32,12 @@
void CLAbsoluteDifference::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output);
+}
+
+void CLAbsoluteDifference::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
+{
auto k = arm_compute::support::cpp14::make_unique<CLAbsoluteDifferenceKernel>();
- k->configure(input1, input2, output);
+ k->configure(compile_context, input1, input2, output);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLAccumulate.cpp b/src/runtime/CL/functions/CLAccumulate.cpp
index 78f25fc..a81d1d0 100644
--- a/src/runtime/CL/functions/CLAccumulate.cpp
+++ b/src/runtime/CL/functions/CLAccumulate.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLAccumulate.h"
#include "arm_compute/core/CL/kernels/CLAccumulateKernel.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
#include <utility>
@@ -32,21 +32,36 @@
void CLAccumulate::configure(const ICLTensor *input, ICLTensor *accum)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, accum);
+}
+
+void CLAccumulate::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *accum)
+{
auto k = arm_compute::support::cpp14::make_unique<CLAccumulateKernel>();
- k->configure(input, accum);
+ k->configure(compile_context, input, accum);
_kernel = std::move(k);
}
void CLAccumulateWeighted::configure(const ICLTensor *input, float alpha, ICLTensor *accum)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, alpha, accum);
+}
+
+void CLAccumulateWeighted::configure(const CLCompileContext &compile_context, const ICLTensor *input, float alpha, ICLTensor *accum)
+{
auto k = arm_compute::support::cpp14::make_unique<CLAccumulateWeightedKernel>();
- k->configure(input, alpha, accum);
+ k->configure(compile_context, input, alpha, accum);
_kernel = std::move(k);
}
void CLAccumulateSquared::configure(const ICLTensor *input, uint32_t shift, ICLTensor *accum)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, shift, accum);
+}
+
+void CLAccumulateSquared::configure(const CLCompileContext &compile_context, const ICLTensor *input, uint32_t shift, ICLTensor *accum)
+{
auto k = arm_compute::support::cpp14::make_unique<CLAccumulateSquaredKernel>();
- k->configure(input, shift, accum);
+ k->configure(compile_context, input, shift, accum);
_kernel = std::move(k);
}
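A caller-side sketch of the two entry points (assumed usage, not taken from the library's examples; shapes are arbitrary):

    #include "arm_compute/core/CL/CLKernelLibrary.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLAccumulate.h"

    using namespace arm_compute;

    void accumulate_example()
    {
        CLScheduler::get().default_init(); // sets up context, queue and kernel library

        CLTensor input, accum;
        input.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::U8));
        accum.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::S16));

        CLAccumulate acc;
        // New overload: pass the compile context explicitly...
        acc.configure(CLKernelLibrary::get().get_compile_context(), &input, &accum);
        // ...which is exactly what the legacy acc.configure(&input, &accum) now does.

        input.allocator()->allocate();
        accum.allocator()->allocate();
        acc.run();
    }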
diff --git a/src/runtime/CL/functions/CLActivationLayer.cpp b/src/runtime/CL/functions/CLActivationLayer.cpp
index 00dbb71..989603a 100644
--- a/src/runtime/CL/functions/CLActivationLayer.cpp
+++ b/src/runtime/CL/functions/CLActivationLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,7 +26,7 @@
#include "arm_compute/core/CL/kernels/CLActivationLayerKernel.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLRuntimeContext.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
namespace arm_compute
{
@@ -37,10 +37,13 @@
void CLActivationLayer::configure(ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info)
{
- auto core_ctx = _ctx ? _ctx->core_runtime_context() : /* Legacy */ nullptr;
+ configure(CLKernelLibrary::get().get_compile_context(), input, output, act_info);
+}

- auto k = arm_compute::support::cpp14::make_unique<CLActivationLayerKernel>(core_ctx);
- k->configure(input, output, act_info);
+void CLActivationLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLActivationLayerKernel>();
+ k->configure(compile_context, input, output, act_info);
_kernel = std::move(k);
}
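Note the behavioural nuance in this file: the legacy path used to build the kernel with _ctx->core_runtime_context() (falling back to nullptr), whereas it now forwards to the context-aware overload and the kernel is default-constructed, so a CLRuntimeContext held by the function no longer influences kernel creation here.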
diff --git a/src/runtime/CL/functions/CLArgMinMaxLayer.cpp b/src/runtime/CL/functions/CLArgMinMaxLayer.cpp
index 4ac6d25..5b4c694 100644
--- a/src/runtime/CL/functions/CLArgMinMaxLayer.cpp
+++ b/src/runtime/CL/functions/CLArgMinMaxLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -106,6 +106,11 @@
void CLArgMinMaxLayer::configure(const ICLTensor *input, int axis, ICLTensor *output, const ReductionOperation &op)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, axis, output, op);
+}
+
+void CLArgMinMaxLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, int axis, ICLTensor *output, const ReductionOperation &op)
+{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
_num_of_stages = calculate_number_of_stages_only_x_axis(input->info()->dimension(0), axis);
_reduction_axis = axis;
@@ -121,7 +126,7 @@
// Create temporary tensors
if(_num_of_stages == 1)
{
- _reduction_kernels_vector[0].configure(input, nullptr, &_not_reshaped_output, axis, op);
+ _reduction_kernels_vector[0].configure(compile_context, input, nullptr, &_not_reshaped_output, axis, op);
}
else
{
@@ -135,22 +140,22 @@
// Apply ReductionOperation only on first kernel
_memory_group.manage(&_results_vector[0]);
- _reduction_kernels_vector[0].configure(input, nullptr, &_results_vector[0], axis, op);
+ _reduction_kernels_vector[0].configure(compile_context, input, nullptr, &_results_vector[0], axis, op);
// Apply ReductionOperation on intermediate stages
for(unsigned int i = 1; i < _num_of_stages - 1; ++i)
{
_memory_group.manage(&_results_vector[i]);
- _reduction_kernels_vector[i].configure(input, &_results_vector[i - 1], &_results_vector[i], axis, op);
+ _reduction_kernels_vector[i].configure(compile_context, input, &_results_vector[i - 1], &_results_vector[i], axis, op);
_results_vector[i - 1].allocator()->allocate();
}
// Apply ReductionOperation on the last stage
const unsigned int last_stage = _num_of_stages - 1;
- _reduction_kernels_vector[last_stage].configure(input, &_results_vector[last_stage - 1], &_not_reshaped_output, axis, op);
+ _reduction_kernels_vector[last_stage].configure(compile_context, input, &_results_vector[last_stage - 1], &_not_reshaped_output, axis, op);
_results_vector[last_stage - 1].allocator()->allocate();
}
- _reshape_kernel.configure(&_not_reshaped_output, output);
+ _reshape_kernel.configure(compile_context, &_not_reshaped_output, output);
_not_reshaped_output.allocator()->allocate();
}
diff --git a/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp b/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp
index f87ea6e..9fc5113 100644
--- a/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp
+++ b/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -40,7 +40,14 @@
void CLBatchNormalizationLayer::configure(ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, const ICLTensor *gamma, float epsilon,
ActivationLayerInfo act_info)
{
- _norm_kernel.configure(input, output, mean, var, beta, gamma, epsilon, act_info);
+ configure(CLKernelLibrary::get().get_compile_context(), input, output, mean, var, beta, gamma, epsilon, act_info);
+}
+
+void CLBatchNormalizationLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta,
+ const ICLTensor *gamma, float epsilon,
+ ActivationLayerInfo act_info)
+{
+ _norm_kernel.configure(compile_context, input, output, mean, var, beta, gamma, epsilon, act_info);
}
Status CLBatchNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output,
diff --git a/src/runtime/CL/functions/CLBatchToSpaceLayer.cpp b/src/runtime/CL/functions/CLBatchToSpaceLayer.cpp
index 7919b13..0a2ae2a 100644
--- a/src/runtime/CL/functions/CLBatchToSpaceLayer.cpp
+++ b/src/runtime/CL/functions/CLBatchToSpaceLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -39,12 +39,22 @@
void CLBatchToSpaceLayer::configure(const ICLTensor *input, const ICLTensor *block_shape, ICLTensor *output)
{
- _batch_to_space_kernel.configure(input, block_shape, output);
+ configure(CLKernelLibrary::get().get_compile_context(), input, block_shape, output);
+}
+
+void CLBatchToSpaceLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, ICLTensor *output)
+{
+ _batch_to_space_kernel.configure(compile_context, input, block_shape, output);
}
void CLBatchToSpaceLayer::configure(const ICLTensor *input, int32_t block_shape_x, int32_t block_shape_y, ICLTensor *output)
{
- _batch_to_space_kernel.configure(input, block_shape_x, block_shape_y, output);
+ configure(CLKernelLibrary::get().get_compile_context(), input, block_shape_x, block_shape_y, output);
+}
+
+void CLBatchToSpaceLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, int32_t block_shape_x, int32_t block_shape_y, ICLTensor *output)
+{
+ _batch_to_space_kernel.configure(compile_context, input, block_shape_x, block_shape_y, output);
}
Status CLBatchToSpaceLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output)
diff --git a/src/runtime/CL/functions/CLBitwiseAnd.cpp b/src/runtime/CL/functions/CLBitwiseAnd.cpp
index f8a5a85..1fa80f0 100644
--- a/src/runtime/CL/functions/CLBitwiseAnd.cpp
+++ b/src/runtime/CL/functions/CLBitwiseAnd.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLBitwiseAnd.h"
#include "arm_compute/core/CL/kernels/CLBitwiseAndKernel.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
#include <utility>
@@ -32,7 +32,12 @@
void CLBitwiseAnd::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output);
+}
+
+void CLBitwiseAnd::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
+{
auto k = arm_compute::support::cpp14::make_unique<CLBitwiseAndKernel>();
- k->configure(input1, input2, output);
+ k->configure(compile_context, input1, input2, output);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLBitwiseNot.cpp b/src/runtime/CL/functions/CLBitwiseNot.cpp
index dc002e5..4659519 100644
--- a/src/runtime/CL/functions/CLBitwiseNot.cpp
+++ b/src/runtime/CL/functions/CLBitwiseNot.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLBitwiseNot.h"
#include "arm_compute/core/CL/kernels/CLBitwiseNotKernel.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
#include <utility>
@@ -32,7 +32,12 @@
void CLBitwiseNot::configure(const ICLTensor *input, ICLTensor *output)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, output);
+}
+
+void CLBitwiseNot::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
+{
auto k = arm_compute::support::cpp14::make_unique<CLBitwiseNotKernel>();
- k->configure(input, output);
+ k->configure(compile_context, input, output);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLBitwiseOr.cpp b/src/runtime/CL/functions/CLBitwiseOr.cpp
index 4a10bb2..8431140 100644
--- a/src/runtime/CL/functions/CLBitwiseOr.cpp
+++ b/src/runtime/CL/functions/CLBitwiseOr.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLBitwiseOr.h"
#include "arm_compute/core/CL/kernels/CLBitwiseOrKernel.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
#include <utility>
@@ -32,7 +32,12 @@
void CLBitwiseOr::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output);
+}
+
+void CLBitwiseOr::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
+{
auto k = arm_compute::support::cpp14::make_unique<CLBitwiseOrKernel>();
- k->configure(input1, input2, output);
+ k->configure(compile_context, input1, input2, output);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLBitwiseXor.cpp b/src/runtime/CL/functions/CLBitwiseXor.cpp
index d23622a..0e0e7f2 100644
--- a/src/runtime/CL/functions/CLBitwiseXor.cpp
+++ b/src/runtime/CL/functions/CLBitwiseXor.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLBitwiseXor.h"
#include "arm_compute/core/CL/kernels/CLBitwiseXorKernel.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
#include <utility>
@@ -32,7 +32,12 @@
void CLBitwiseXor::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output);
+}
+
+void CLBitwiseXor::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
+{
auto k = arm_compute::support::cpp14::make_unique<CLBitwiseXorKernel>();
- k->configure(input1, input2, output);
+ k->configure(compile_context, input1, input2, output);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLBoundingBoxTransform.cpp b/src/runtime/CL/functions/CLBoundingBoxTransform.cpp
index 46a6b8e..55bcde7 100644
--- a/src/runtime/CL/functions/CLBoundingBoxTransform.cpp
+++ b/src/runtime/CL/functions/CLBoundingBoxTransform.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,15 +24,20 @@
#include "arm_compute/runtime/CL/functions/CLBoundingBoxTransform.h"
#include "arm_compute/core/CL/kernels/CLBoundingBoxTransformKernel.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
namespace arm_compute
{
void CLBoundingBoxTransform::configure(const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info)
{
+ configure(CLKernelLibrary::get().get_compile_context(), boxes, pred_boxes, deltas, info);
+}
+
+void CLBoundingBoxTransform::configure(const CLCompileContext &compile_context, const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info)
+{
// Configure Bounding Box kernel
auto k = arm_compute::support::cpp14::make_unique<CLBoundingBoxTransformKernel>();
- k->configure(boxes, pred_boxes, deltas, info);
+ k->configure(compile_context, boxes, pred_boxes, deltas, info);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLBox3x3.cpp b/src/runtime/CL/functions/CLBox3x3.cpp
index f28be44..72c8221 100644
--- a/src/runtime/CL/functions/CLBox3x3.cpp
+++ b/src/runtime/CL/functions/CLBox3x3.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,7 +25,7 @@
#include "arm_compute/core/CL/kernels/CLBox3x3Kernel.h"
#include "arm_compute/core/PixelValue.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
#include <utility>
@@ -33,8 +33,13 @@
void CLBox3x3::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, output, border_mode, constant_border_value);
+}
+
+void CLBox3x3::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
+{
auto k = arm_compute::support::cpp14::make_unique<CLBox3x3Kernel>();
- k->configure(input, output, border_mode == BorderMode::UNDEFINED);
+ k->configure(compile_context, input, output, border_mode == BorderMode::UNDEFINED);
_kernel = std::move(k);
- _border_handler.configure(input, BorderSize(1), border_mode, PixelValue(constant_border_value));
+ _border_handler.configure(compile_context, input, BorderSize(1), border_mode, PixelValue(constant_border_value));
}
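For filter-style functions such as this one, the same compile context is threaded into both the filter kernel and its border handler. A usage sketch (assumed, with src and dst prepared as U8 CLTensors in the same way as the accumulate sketch above):

    CLBox3x3 box;
    box.configure(CLKernelLibrary::get().get_compile_context(), &src, &dst,
                  BorderMode::CONSTANT, /*constant_border_value=*/0);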
diff --git a/src/runtime/CL/functions/CLCannyEdge.cpp b/src/runtime/CL/functions/CLCannyEdge.cpp
index dbaea81..0c8d353 100644
--- a/src/runtime/CL/functions/CLCannyEdge.cpp
+++ b/src/runtime/CL/functions/CLCannyEdge.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -31,7 +31,7 @@
#include "arm_compute/runtime/CL/functions/CLSobel3x3.h"
#include "arm_compute/runtime/CL/functions/CLSobel5x5.h"
#include "arm_compute/runtime/CL/functions/CLSobel7x7.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
using namespace arm_compute;
@@ -58,6 +58,13 @@
void CLCannyEdge::configure(ICLTensor *input, ICLTensor *output, int32_t upper_thr, int32_t lower_thr, int32_t gradient_size, int32_t norm_type, BorderMode border_mode,
uint8_t constant_border_value)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, output, upper_thr, lower_thr, gradient_size, norm_type, border_mode, constant_border_value);
+}
+
+void CLCannyEdge::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, int32_t upper_thr, int32_t lower_thr, int32_t gradient_size, int32_t norm_type,
+ BorderMode border_mode,
+ uint8_t constant_border_value)
+{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
@@ -111,19 +118,19 @@
if(gradient_size == 3)
{
auto k = arm_compute::support::cpp14::make_unique<CLSobel3x3>();
- k->configure(input, &_gx, &_gy, border_mode, constant_border_value);
+ k->configure(compile_context, input, &_gx, &_gy, border_mode, constant_border_value);
_sobel = std::move(k);
}
else if(gradient_size == 5)
{
auto k = arm_compute::support::cpp14::make_unique<CLSobel5x5>();
- k->configure(input, &_gx, &_gy, border_mode, constant_border_value);
+ k->configure(compile_context, input, &_gx, &_gy, border_mode, constant_border_value);
_sobel = std::move(k);
}
else if(gradient_size == 7)
{
auto k = arm_compute::support::cpp14::make_unique<CLSobel7x7>();
- k->configure(input, &_gx, &_gy, border_mode, constant_border_value);
+ k->configure(compile_context, input, &_gx, &_gy, border_mode, constant_border_value);
_sobel = std::move(k);
}
else
@@ -136,7 +143,7 @@
_memory_group.manage(&_phase);
// Configure gradient
- _gradient.configure(&_gx, &_gy, &_mag, &_phase, norm_type);
+ _gradient.configure(compile_context, &_gx, &_gy, &_mag, &_phase, norm_type);
// Allocate intermediate buffers
_gx.allocator()->allocate();
@@ -146,14 +153,14 @@
_memory_group.manage(&_nonmax);
// Configure non-maxima suppression
- _non_max_suppr.configure(&_mag, &_phase, &_nonmax, lower_thr, border_mode == BorderMode::UNDEFINED);
+ _non_max_suppr.configure(compile_context, &_mag, &_phase, &_nonmax, lower_thr, border_mode == BorderMode::UNDEFINED);
// Allocate intermediate buffers
_phase.allocator()->allocate();
// Fill border around magnitude image as non-maxima suppression will access
// it. If border mode is undefined filling the border is a nop.
- _border_mag_gradient.configure(&_mag, _non_max_suppr.border_size(), border_mode, constant_border_value);
+ _border_mag_gradient.configure(compile_context, &_mag, _non_max_suppr.border_size(), border_mode, constant_border_value);
// Allocate intermediate buffers
_mag.allocator()->allocate();
@@ -165,7 +172,7 @@
_memory_group.manage(&_l1_list_counter);
// Configure edge tracing
- _edge_trace.configure(&_nonmax, output, upper_thr, lower_thr, &_visited, &_recorded, &_l1_stack, &_l1_list_counter);
+ _edge_trace.configure(compile_context, &_nonmax, output, upper_thr, lower_thr, &_visited, &_recorded, &_l1_stack, &_l1_list_counter);
// Allocate intermediate buffers
_visited.allocator()->allocate();
diff --git a/src/runtime/CL/functions/CLCast.cpp b/src/runtime/CL/functions/CLCast.cpp
index e0ffcdb..7048a79 100644
--- a/src/runtime/CL/functions/CLCast.cpp
+++ b/src/runtime/CL/functions/CLCast.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLCast.h"
#include "arm_compute/core/CL/kernels/CLDepthConvertLayerKernel.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
#include <utility>
@@ -32,8 +32,13 @@
{
void CLCast::configure(const ICLTensor *input, ICLTensor *output, ConvertPolicy policy)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, output, policy);
+}
+
+void CLCast::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, ConvertPolicy policy)
+{
auto k = arm_compute::support::cpp14::make_unique<CLDepthConvertLayerKernel>();
- k->configure(input, output, policy, 0);
+ k->configure(compile_context, input, output, policy, 0);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLChannelCombine.cpp b/src/runtime/CL/functions/CLChannelCombine.cpp
index 11605cf..249212e 100644
--- a/src/runtime/CL/functions/CLChannelCombine.cpp
+++ b/src/runtime/CL/functions/CLChannelCombine.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLChannelCombine.h"
#include "arm_compute/core/CL/kernels/CLChannelCombineKernel.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
#include <utility>
@@ -32,14 +32,24 @@
void CLChannelCombine::configure(const ICLTensor *plane0, const ICLTensor *plane1, const ICLTensor *plane2, const ICLTensor *plane3, ICLTensor *output)
{
+ configure(CLKernelLibrary::get().get_compile_context(), plane0, plane1, plane2, plane3, output);
+}
+
+void CLChannelCombine::configure(const CLCompileContext &compile_context, const ICLTensor *plane0, const ICLTensor *plane1, const ICLTensor *plane2, const ICLTensor *plane3, ICLTensor *output)
+{
auto k = arm_compute::support::cpp14::make_unique<CLChannelCombineKernel>();
- k->configure(plane0, plane1, plane2, plane3, output);
+ k->configure(compile_context, plane0, plane1, plane2, plane3, output);
_kernel = std::move(k);
}
void CLChannelCombine::configure(const ICLImage *plane0, const ICLImage *plane1, const ICLImage *plane2, ICLMultiImage *output)
{
+ configure(CLKernelLibrary::get().get_compile_context(), plane0, plane1, plane2, output);
+}
+
+void CLChannelCombine::configure(const CLCompileContext &compile_context, const ICLImage *plane0, const ICLImage *plane1, const ICLImage *plane2, ICLMultiImage *output)
+{
auto k = arm_compute::support::cpp14::make_unique<CLChannelCombineKernel>();
- k->configure(plane0, plane1, plane2, output);
+ k->configure(compile_context, plane0, plane1, plane2, output);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLChannelExtract.cpp b/src/runtime/CL/functions/CLChannelExtract.cpp
index 5090382..019e0a7 100644
--- a/src/runtime/CL/functions/CLChannelExtract.cpp
+++ b/src/runtime/CL/functions/CLChannelExtract.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLChannelExtract.h"
#include "arm_compute/core/CL/kernels/CLChannelExtractKernel.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
#include <utility>
@@ -32,14 +32,24 @@
void CLChannelExtract::configure(const ICLTensor *input, Channel channel, ICLTensor *output)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, channel, output);
+}
+
+void CLChannelExtract::configure(const CLCompileContext &compile_context, const ICLTensor *input, Channel channel, ICLTensor *output)
+{
auto k = arm_compute::support::cpp14::make_unique<CLChannelExtractKernel>();
- k->configure(input, channel, output);
+ k->configure(compile_context, input, channel, output);
_kernel = std::move(k);
}
void CLChannelExtract::configure(const ICLMultiImage *input, Channel channel, ICLImage *output)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, channel, output);
+}
+
+void CLChannelExtract::configure(const CLCompileContext &compile_context, const ICLMultiImage *input, Channel channel, ICLImage *output)
+{
auto k = arm_compute::support::cpp14::make_unique<CLChannelExtractKernel>();
- k->configure(input, channel, output);
+ k->configure(compile_context, input, channel, output);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLChannelShuffleLayer.cpp b/src/runtime/CL/functions/CLChannelShuffleLayer.cpp
index ff50073..93ab7c7 100644
--- a/src/runtime/CL/functions/CLChannelShuffleLayer.cpp
+++ b/src/runtime/CL/functions/CLChannelShuffleLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,14 +25,19 @@
#include "arm_compute/core/CL/kernels/CLChannelShuffleLayerKernel.h"
#include "arm_compute/core/Types.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
namespace arm_compute
{
void CLChannelShuffleLayer::configure(const ICLTensor *input, ICLTensor *output, unsigned int num_groups)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, output, num_groups);
+}
+
+void CLChannelShuffleLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, unsigned int num_groups)
+{
auto k = arm_compute::support::cpp14::make_unique<CLChannelShuffleLayerKernel>();
- k->configure(input, output, num_groups);
+ k->configure(compile_context, input, output, num_groups);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLColorConvert.cpp b/src/runtime/CL/functions/CLColorConvert.cpp
index 65f8ac3..b8e5977 100644
--- a/src/runtime/CL/functions/CLColorConvert.cpp
+++ b/src/runtime/CL/functions/CLColorConvert.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLColorConvert.h"
#include "arm_compute/core/CL/kernels/CLColorConvertKernel.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
#include <utility>
@@ -32,28 +32,48 @@
void CLColorConvert::configure(const ICLTensor *input, ICLTensor *output)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, output);
+}
+
+void CLColorConvert::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
+{
auto k = arm_compute::support::cpp14::make_unique<CLColorConvertKernel>();
- k->configure(input, output);
+ k->configure(compile_context, input, output);
_kernel = std::move(k);
}
void CLColorConvert::configure(const ICLImage *input, ICLMultiImage *output)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, output);
+}
+
+void CLColorConvert::configure(const CLCompileContext &compile_context, const ICLImage *input, ICLMultiImage *output)
+{
auto k = arm_compute::support::cpp14::make_unique<CLColorConvertKernel>();
- k->configure(input, output);
+ k->configure(compile_context, input, output);
_kernel = std::move(k);
}
void CLColorConvert::configure(const ICLMultiImage *input, ICLImage *output)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, output);
+}
+
+void CLColorConvert::configure(const CLCompileContext &compile_context, const ICLMultiImage *input, ICLImage *output)
+{
auto k = arm_compute::support::cpp14::make_unique<CLColorConvertKernel>();
- k->configure(input, output);
+ k->configure(compile_context, input, output);
_kernel = std::move(k);
}
void CLColorConvert::configure(const ICLMultiImage *input, ICLMultiImage *output)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, output);
+}
+
+void CLColorConvert::configure(const CLCompileContext &compile_context, const ICLMultiImage *input, ICLMultiImage *output)
+{
auto k = arm_compute::support::cpp14::make_unique<CLColorConvertKernel>();
- k->configure(input, output);
+ k->configure(compile_context, input, output);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLComparison.cpp b/src/runtime/CL/functions/CLComparison.cpp
index 86c9c31..8d5ec35 100644
--- a/src/runtime/CL/functions/CLComparison.cpp
+++ b/src/runtime/CL/functions/CLComparison.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,14 +26,19 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/CL/kernels/CLComparisonKernel.h"
#include "arm_compute/core/Types.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
namespace arm_compute
{
void CLComparison::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ComparisonOperation operation)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, operation);
+}
+
+void CLComparison::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ComparisonOperation operation)
+{
auto k = arm_compute::support::cpp14::make_unique<CLComparisonKernel>();
- k->configure(input1, input2, output, operation);
+ k->configure(compile_context, input1, input2, output, operation);
_kernel = std::move(k);
if(output->info()->dimension(0) > 1)
@@ -42,7 +47,7 @@
if(broadcasted_info->info()->dimension(0) == 1)
{
- _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
+ _border_handler.configure(compile_context, broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
}
}
}
@@ -55,8 +60,14 @@
template <ComparisonOperation COP>
void CLComparisonStatic<COP>::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output);
+}
+
+template <ComparisonOperation COP>
+void CLComparisonStatic<COP>::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output)
+{
auto k = arm_compute::support::cpp14::make_unique<CLComparisonKernel>();
- k->configure(input1, input2, output, COP);
+ k->configure(compile_context, input1, input2, output, COP);
_kernel = std::move(k);
if(output->info()->dimension(0) > 1)
@@ -65,7 +76,7 @@
if(broadcasted_info->info()->dimension(0) == 1)
{
- _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
+ _border_handler.configure(compile_context, broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
}
}
}
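In both the dynamic and the static comparison, the extra logic after kernel creation handles x-axis broadcasting: if the output has more than one element along dimension 0 and one input has dimension(0) == 1, that input's border is filled with BorderMode::REPLICATE, now through the same compile context as the kernel itself.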
diff --git a/src/runtime/CL/functions/CLComputeAllAnchors.cpp b/src/runtime/CL/functions/CLComputeAllAnchors.cpp
index 24c152f..62714fe 100644
--- a/src/runtime/CL/functions/CLComputeAllAnchors.cpp
+++ b/src/runtime/CL/functions/CLComputeAllAnchors.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,15 +23,20 @@
*/
#include "arm_compute/runtime/CL/functions/CLComputeAllAnchors.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
namespace arm_compute
{
void CLComputeAllAnchors::configure(const ICLTensor *anchors, ICLTensor *all_anchors, const ComputeAnchorsInfo &info)
{
+ configure(CLKernelLibrary::get().get_compile_context(), anchors, all_anchors, info);
+}
+
+void CLComputeAllAnchors::configure(const CLCompileContext &compile_context, const ICLTensor *anchors, ICLTensor *all_anchors, const ComputeAnchorsInfo &info)
+{
// Configure ComputeAllAnchors kernel
auto k = arm_compute::support::cpp14::make_unique<CLComputeAllAnchorsKernel>();
- k->configure(anchors, all_anchors, info);
+ k->configure(compile_context, anchors, all_anchors, info);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLConcatenateLayer.cpp b/src/runtime/CL/functions/CLConcatenateLayer.cpp
index 5d224db..e972567 100644
--- a/src/runtime/CL/functions/CLConcatenateLayer.cpp
+++ b/src/runtime/CL/functions/CLConcatenateLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -36,7 +36,7 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
namespace arm_compute
{
@@ -49,12 +49,22 @@
void CLConcatenateLayer::configure(std::vector<ICLTensor *> &inputs_vector, ICLTensor *output, size_t axis)
{
- configure_internal(std::move(inputs_vector), output, axis);
+ configure(CLKernelLibrary::get().get_compile_context(), inputs_vector, output, axis);
+}
+
+void CLConcatenateLayer::configure(const CLCompileContext &compile_context, std::vector<ICLTensor *> &inputs_vector, ICLTensor *output, size_t axis)
+{
+ configure_internal(compile_context, std::move(inputs_vector), output, axis);
}
void CLConcatenateLayer::configure(std::vector<const ICLTensor *> &inputs_vector, ICLTensor *output, size_t axis)
{
- configure_internal(std::move(inputs_vector), output, axis);
+ configure(CLKernelLibrary::get().get_compile_context(), inputs_vector, output, axis);
+}
+
+void CLConcatenateLayer::configure(const CLCompileContext &compile_context, std::vector<const ICLTensor *> &inputs_vector, ICLTensor *output, size_t axis)
+{
+ configure_internal(compile_context, std::move(inputs_vector), output, axis);
}
Status CLConcatenateLayer::validate(const std::vector<ITensorInfo *> &inputs_vector, const ITensorInfo *output, size_t axis)
@@ -68,7 +78,7 @@
}
template <typename TensorType>
-void CLConcatenateLayer::configure_internal(std::vector<TensorType *> &&inputs_vector, ICLTensor *output, size_t axis)
+void CLConcatenateLayer::configure_internal(const CLCompileContext &compile_context, std::vector<TensorType *> &&inputs_vector, ICLTensor *output, size_t axis)
{
ARM_COMPUTE_ERROR_ON(output == nullptr);
_axis = axis;
@@ -97,7 +107,7 @@
{
// Configure WidthConcatenate2Tensors kernel
auto kernel = support::cpp14::make_unique<CLWidthConcatenate2TensorsKernel>();
- kernel->configure(inputs_vector.at(0), inputs_vector.at(1), output);
+ kernel->configure(compile_context, inputs_vector.at(0), inputs_vector.at(1), output);
_concat_kernels.emplace_back(std::move(kernel));
break;
}
@@ -105,7 +115,7 @@
{
// Configure WidthConcatenate4Tensors kernel
auto kernel = support::cpp14::make_unique<CLWidthConcatenate4TensorsKernel>();
- kernel->configure(inputs_vector.at(0), inputs_vector.at(1), inputs_vector.at(2), inputs_vector.at(3), output);
+ kernel->configure(compile_context, inputs_vector.at(0), inputs_vector.at(1), inputs_vector.at(2), inputs_vector.at(3), output);
_concat_kernels.emplace_back(std::move(kernel));
break;
}
@@ -115,7 +125,7 @@
for(unsigned int i = 0; i < _num_inputs; ++i)
{
auto kernel = support::cpp14::make_unique<CLWidthConcatenateLayerKernel>();
- kernel->configure(inputs_vector.at(i), offset, output);
+ kernel->configure(compile_context, inputs_vector.at(i), offset, output);
offset += inputs_vector.at(i)->info()->dimension(_axis);
_concat_kernels.emplace_back(std::move(kernel));
}
@@ -129,7 +139,7 @@
for(unsigned int i = 0; i < _num_inputs; ++i)
{
auto kernel = support::cpp14::make_unique<CLHeightConcatenateLayerKernel>();
- kernel->configure(inputs_vector.at(i), offset, output);
+ kernel->configure(compile_context, inputs_vector.at(i), offset, output);
offset += inputs_vector.at(i)->info()->dimension(_axis);
_concat_kernels.emplace_back(std::move(kernel));
}
@@ -140,7 +150,7 @@
for(unsigned int i = 0; i < _num_inputs; ++i)
{
auto kernel = support::cpp14::make_unique<CLDepthConcatenateLayerKernel>();
- kernel->configure(inputs_vector.at(i), offset, output);
+ kernel->configure(compile_context, inputs_vector.at(i), offset, output);
offset += inputs_vector.at(i)->info()->dimension(_axis);
_concat_kernels.emplace_back(std::move(kernel));
}
@@ -151,7 +161,7 @@
for(unsigned int i = 0; i < _num_inputs; ++i)
{
auto kernel = support::cpp14::make_unique<CLBatchConcatenateLayerKernel>();
- kernel->configure(inputs_vector.at(i), offset, output);
+ kernel->configure(compile_context, inputs_vector.at(i), offset, output);
offset += inputs_vector.at(i)->info()->dimension(_axis);
_concat_kernels.emplace_back(std::move(kernel));
}
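configure_internal() is a template over the tensor pointer type so that the ICLTensor* and const ICLTensor* public overloads share one implementation, and it picks a fused kernel where possible (CLWidthConcatenate2TensorsKernel / CLWidthConcatenate4TensorsKernel for exactly two or four width-axis inputs, otherwise one kernel per input). A caller-side sketch (assumed usage; a, b and out are pre-initialised CLTensors):

    std::vector<const ICLTensor *> ins = { &a, &b };
    CLConcatenateLayer concat;
    // Two inputs on the width axis (0) take the fused two-tensor kernel path.
    concat.configure(CLKernelLibrary::get().get_compile_context(), ins, &out, 0U);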
diff --git a/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp b/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp
index 02927e8..68c0fb6 100644
--- a/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp
+++ b/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp
@@ -28,8 +28,14 @@
void CLConvertFullyConnectedWeights::configure(const ICLTensor *input, ICLTensor *output, const TensorShape &original_input_shape,
DataLayout data_layout)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, output, original_input_shape, data_layout);
+}
+
+void CLConvertFullyConnectedWeights::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const TensorShape &original_input_shape,
+ DataLayout data_layout)
+{
auto k = arm_compute::support::cpp14::make_unique<CLConvertFullyConnectedWeightsKernel>();
- k->configure(input, output, original_input_shape, data_layout);
+ k->configure(compile_context, input, output, original_input_shape, data_layout);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLConvolution.cpp b/src/runtime/CL/functions/CLConvolution.cpp
index f09585e..2b0d7d5 100644
--- a/src/runtime/CL/functions/CLConvolution.cpp
+++ b/src/runtime/CL/functions/CLConvolution.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -32,7 +32,7 @@
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/ITensorAllocator.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
#include <utility>
@@ -40,10 +40,16 @@
void CLConvolution3x3::configure(ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, output, conv, scale, border_mode, constant_border_value);
+}
+
+void CLConvolution3x3::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode,
+ uint8_t constant_border_value)
+{
auto k = arm_compute::support::cpp14::make_unique<CLConvolution3x3Kernel>();
- k->configure(input, output, conv, scale, border_mode == BorderMode::UNDEFINED);
+ k->configure(compile_context, input, output, conv, scale, border_mode == BorderMode::UNDEFINED);
_kernel = std::move(k);
- _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+ _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
}
template <unsigned int matrix_size>
@@ -56,6 +62,13 @@
void CLConvolutionSquare<matrix_size>::configure(ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode,
uint8_t constant_border_value)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, output, conv, scale, border_mode, constant_border_value);
+}
+
+template <unsigned int matrix_size>
+void CLConvolutionSquare<matrix_size>::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode,
+ uint8_t constant_border_value)
+{
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
ARM_COMPUTE_ERROR_ON(conv == nullptr);
std::array<int16_t, matrix_size> conv_col{ 0 };
@@ -75,17 +88,17 @@
scale = calculate_matrix_scale(conv, matrix_size);
}
- _kernel_hor.configure(input, &_tmp, conv_row.data(), border_mode == BorderMode::UNDEFINED);
- _kernel_vert.configure(&_tmp, output, conv_col.data(), scale, border_mode == BorderMode::UNDEFINED, type_pair.second);
- _border_handler.configure(input, _kernel_hor.border_size(), border_mode, PixelValue(constant_border_value));
+ _kernel_hor.configure(compile_context, input, &_tmp, conv_row.data(), border_mode == BorderMode::UNDEFINED);
+ _kernel_vert.configure(compile_context, &_tmp, output, conv_col.data(), scale, border_mode == BorderMode::UNDEFINED, type_pair.second);
+ _border_handler.configure(compile_context, input, _kernel_hor.border_size(), border_mode, PixelValue(constant_border_value));
// Allocate intermediate buffer
_tmp.allocator()->allocate();
}
else
{
- _kernel.configure(input, output, conv, scale, border_mode == BorderMode::UNDEFINED);
- _border_handler.configure(input, _kernel.border_size(), border_mode, PixelValue(constant_border_value));
+ _kernel.configure(compile_context, input, output, conv, scale, border_mode == BorderMode::UNDEFINED);
+ _border_handler.configure(compile_context, input, _kernel.border_size(), border_mode, PixelValue(constant_border_value));
}
}
@@ -113,8 +126,14 @@
void CLConvolutionRectangle::configure(ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t rows, uint32_t cols, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, output, conv, rows, cols, scale, border_mode, constant_border_value);
+}
+
+void CLConvolutionRectangle::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t rows, uint32_t cols, uint32_t scale,
+ BorderMode border_mode, uint8_t constant_border_value)
+{
auto k = arm_compute::support::cpp14::make_unique<CLConvolutionRectangleKernel>();
- k->configure(input, output, conv, rows, cols, scale, border_mode == BorderMode::UNDEFINED);
+ k->configure(compile_context, input, output, conv, rows, cols, scale, border_mode == BorderMode::UNDEFINED);
_kernel = std::move(k);
- _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+ _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
}
diff --git a/src/runtime/CL/functions/CLConvolutionLayer.cpp b/src/runtime/CL/functions/CLConvolutionLayer.cpp
index c271f50..b6e1413 100644
--- a/src/runtime/CL/functions/CLConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLConvolutionLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -46,6 +46,13 @@
void CLConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups);
+}
+
+void CLConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info,
+ const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups)
+{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
ARM_COMPUTE_ERROR_THROW_ON(CLConvolutionLayer::validate(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info, weights_info, dilation, act_info,
enable_fast_math, num_groups));
@@ -57,7 +64,7 @@
{
ARM_COMPUTE_ERROR_ON(num_groups != 1);
auto f = arm_compute::support::cpp14::make_unique<CLWinogradConvolutionLayer>(_memory_manager);
- f->configure(input, weights, biases, output, conv_info, act_info, enable_fast_math);
+ f->configure(compile_context, input, weights, biases, output, conv_info, act_info, enable_fast_math);
_function = std::move(f);
break;
}
@@ -65,21 +72,21 @@
{
ARM_COMPUTE_ERROR_ON(num_groups != 1);
auto f = arm_compute::support::cpp14::make_unique<CLDirectConvolutionLayer>();
- f->configure(input, weights, biases, output, conv_info, act_info);
+ f->configure(compile_context, input, weights, biases, output, conv_info, act_info);
_function = std::move(f);
break;
}
case ConvolutionMethod::GEMM:
{
auto f = arm_compute::support::cpp14::make_unique<CLGEMMConvolutionLayer>(_memory_manager);
- f->configure(input, weights, biases, output, conv_info, weights_info, dilation, act_info, num_groups);
+ f->configure(compile_context, input, weights, biases, output, conv_info, weights_info, dilation, act_info, num_groups);
_function = std::move(f);
break;
}
case ConvolutionMethod::FFT:
{
auto f = arm_compute::support::cpp14::make_unique<CLFFTConvolutionLayer>(_memory_manager);
- f->configure(input, weights, biases, output, conv_info, act_info);
+ f->configure(compile_context, input, weights, biases, output, conv_info, act_info);
_function = std::move(f);
break;
}
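The convolution function is a dispatcher: get_convolution_method() selects Winograd, Direct, GEMM or FFT, and the chosen implementation receives the same compile context. A caller-side sketch with the full argument list of the new overload (assumed usage; src, weights, biases and dst are pre-initialised CLTensors):

    CLConvolutionLayer conv;
    conv.configure(CLKernelLibrary::get().get_compile_context(),
                   &src, &weights, &biases, &dst,
                   PadStrideInfo(1, 1, 0, 0),      // stride 1x1, no padding
                   WeightsInfo(), Size2D(1U, 1U),  // no reshaped weights, no dilation
                   ActivationLayerInfo(),          // no fused activation
                   /*enable_fast_math=*/false, /*num_groups=*/1);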
diff --git a/src/runtime/CL/functions/CLCopy.cpp b/src/runtime/CL/functions/CLCopy.cpp
index d1b7926..4c5d62a 100644
--- a/src/runtime/CL/functions/CLCopy.cpp
+++ b/src/runtime/CL/functions/CLCopy.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,7 +29,7 @@
#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
#include <utility>
@@ -37,8 +37,13 @@
void CLCopy::configure(ICLTensor *input, ICLTensor *output)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, output);
+}
+
+void CLCopy::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output)
+{
auto k = arm_compute::support::cpp14::make_unique<CLCopyKernel>();
- k->configure(input, output);
+ k->configure(compile_context, input, output);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLCropResize.cpp b/src/runtime/CL/functions/CLCropResize.cpp
index 5e1278d..17fc80e 100644
--- a/src/runtime/CL/functions/CLCropResize.cpp
+++ b/src/runtime/CL/functions/CLCropResize.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,11 +21,10 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
+#include "arm_compute/runtime/CL/functions/CLCropResize.h"
#include "arm_compute/core/CL/CLHelpers.h"
-
#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "arm_compute/runtime/CL/functions/CLCropResize.h"
#include <cstddef>
@@ -51,120 +50,10 @@
const TensorShape out_shape(input->info()->tensor_shape()[0], static_cast<uint32_t>(abs(end[0] - start[0])) + 1, static_cast<uint32_t>(abs(end[1] - start[1])) + 1);
output->info()->set_tensor_shape(out_shape);
}
-
-inline void run_crop(const ICLTensor *input, ICLTensor *output, uint32_t batch_index, Coordinates start, Coordinates end, float extrapolation_value)
-{
- bool is_width_flipped = end[0] < start[0];
- bool is_height_flipped = end[1] < start[1];
- /** The number of rows out of bounds at the start and end of output. */
- std::array<int32_t, 2> rows_out_of_bounds{ 0 };
- /** The number of columns out of bounds at the start and end of output. */
- std::array<int32_t, 2> cols_out_of_bounds{ 0 };
- if(is_height_flipped)
- {
- rows_out_of_bounds[0] = start[1] >= static_cast<int32_t>(input->info()->dimension(2)) ? std::min(start[1] - input->info()->dimension(2) + 1, output->info()->dimension(2)) : 0;
- rows_out_of_bounds[1] = end[1] < 0 ? std::min(-end[1], static_cast<int32_t>(output->info()->dimension(2))) : 0;
- }
- else
- {
- rows_out_of_bounds[0] = start[1] < 0 ? std::min(-start[1], static_cast<int32_t>(output->info()->dimension(2))) : 0;
- rows_out_of_bounds[1] = end[1] >= static_cast<int32_t>(input->info()->dimension(2)) ? std::min(end[1] - input->info()->dimension(2) + 1, output->info()->dimension(2)) : 0;
- }
- if(is_width_flipped)
- {
- cols_out_of_bounds[0] = start[0] >= static_cast<int32_t>(input->info()->dimension(1)) ? std::min(start[0] - input->info()->dimension(1) + 1, output->info()->dimension(1)) : 0;
- cols_out_of_bounds[1] = end[0] < 0 ? std::min(-end[0], static_cast<int32_t>(output->info()->dimension(1))) : 0;
- }
- else
- {
- cols_out_of_bounds[0] = start[0] < 0 ? std::min(-start[0], static_cast<int32_t>(output->info()->dimension(1))) : 0;
- cols_out_of_bounds[1] = end[0] >= static_cast<int32_t>(input->info()->dimension(1)) ? std::min(end[0] - input->info()->dimension(1) + 1, output->info()->dimension(1)) : 0;
- }
-
- Window full_window = calculate_max_window(*output->info());
-
- // Full output window:
- // --------------------------------
- // | Out of bounds |
- // | rows before |
- // |------------------------------|
- // | Out of | In | Out of |
- // | bounds | bounds | bounds |
- // | cols | elements | cols |
- // | before | copied | after |
- // | | from input | |
- // |------------------------------|
- // | Out of bounds |
- // | rows after |
- // |------------------------------|
- // Use a separate output window for each section of the full output window.
- // Fill all output rows that have no elements that are within the input bounds
- // with the extrapolation value using memset.
- // First for the rows before the in bounds rows.
- if(rows_out_of_bounds[0] > 0)
- {
- Window slice_fill_rows_before(full_window);
- slice_fill_rows_before.set(2, Window::Dimension(0, rows_out_of_bounds[0], 1));
- auto kernel = arm_compute::support::cpp14::make_unique<CLMemsetKernel>();
- kernel->configure(output, extrapolation_value, &slice_fill_rows_before);
- CLScheduler::get().enqueue(*kernel);
- }
-
- Window slice_in(full_window);
- slice_in.set(2, Window::Dimension(rows_out_of_bounds[0], output->info()->dimension(2) - rows_out_of_bounds[1], 1));
- slice_in.set(1, Window::Dimension(cols_out_of_bounds[0], output->info()->dimension(1) - cols_out_of_bounds[1], 1));
-
- int rows_in_bounds = static_cast<int32_t>(output->info()->dimension(2)) - rows_out_of_bounds[0] - rows_out_of_bounds[1];
- if(rows_in_bounds > 0)
- {
- // Fill all elements that share a row with an in bounds element with the extrapolation value.
- if(cols_out_of_bounds[0] > 0)
- {
- Window slice_fill_cols_before(slice_in);
- slice_fill_cols_before.set(1, Window::Dimension(0, cols_out_of_bounds[0], 1));
- auto kernel = arm_compute::support::cpp14::make_unique<CLMemsetKernel>();
- kernel->configure(output, extrapolation_value, &slice_fill_cols_before);
- CLScheduler::get().enqueue(*kernel);
- }
-
- if(cols_out_of_bounds[1] > 0)
- {
- Window slice_fill_cols_after(slice_in);
- slice_fill_cols_after.set(1, Window::Dimension(output->info()->dimension(1) - cols_out_of_bounds[1], output->info()->dimension(1), 1));
- auto kernel = arm_compute::support::cpp14::make_unique<CLMemsetKernel>();
- kernel->configure(output, extrapolation_value, &slice_fill_cols_after);
- CLScheduler::get().enqueue(*kernel);
- }
-
- // Copy all elements within the input bounds from the input tensor.
- int cols_in_bounds = static_cast<int32_t>(output->info()->dimension(1)) - cols_out_of_bounds[0] - cols_out_of_bounds[1];
- if(cols_in_bounds > 0)
- {
- Coordinates2D start_in{ is_width_flipped ? start[0] - cols_out_of_bounds[0] : start[0] + cols_out_of_bounds[0],
- is_height_flipped ? start[1] - rows_out_of_bounds[0] : start[1] + rows_out_of_bounds[0] };
- Coordinates2D end_in{ is_width_flipped ? start_in.x - cols_in_bounds + 1 : start_in.x + cols_in_bounds - 1,
- is_height_flipped ? start_in.y - rows_in_bounds + 1 : start_in.y + rows_in_bounds - 1 };
- auto kernel = arm_compute::support::cpp14::make_unique<CLCropKernel>();
-
- kernel->configure(input, output, start_in, end_in, batch_index, extrapolation_value, &slice_in);
- CLScheduler::get().enqueue(*kernel);
- }
- }
-
- // Fill all rows after the in bounds elements with the extrapolation value.
- if(rows_out_of_bounds[1] > 0)
- {
- Window slice_fill_rows_after(full_window);
- slice_fill_rows_after.set(2, Window::Dimension(output->info()->dimension(2) - rows_out_of_bounds[1], output->info()->dimension(2), 1));
- auto kernel = arm_compute::support::cpp14::make_unique<CLMemsetKernel>();
- kernel->configure(output, extrapolation_value, &slice_fill_rows_after);
- CLScheduler::get().enqueue(*kernel);
- }
-}
} // namespace
CLCropResize::CLCropResize()
- : _input(nullptr), _boxes(nullptr), _box_ind(nullptr), _output(nullptr), _num_boxes(0), _method(), _extrapolation_value(0), _scale(), _copy(), _crop_results(), _scaled_results()
+ : _input(nullptr), _boxes(nullptr), _box_ind(nullptr), _output(nullptr), _num_boxes(0), _method(), _extrapolation_value(0), _scale(), _copy(), _crop_results(), _scaled_results(), _internal_kernels()
{
}
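The CLCropResize rework is the largest change in this diff: the helper run_crop(), which created and enqueued CLMemsetKernel/CLCropKernel objects on every run, is deleted, and all kernels are now created once in configure() (tracked via the new _internal_kernels member). Because the crop windows depend on the contents of the boxes tensors, configure() must map _boxes and _box_ind to read them, as the comment below notes.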
@@ -190,9 +79,18 @@
void CLCropResize::configure(const ICLTensor *input, ICLTensor *boxes, ICLTensor *box_ind, ICLTensor *output, Coordinates2D crop_size,
InterpolationPolicy method, float extrapolation_value)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ configure(CLKernelLibrary::get().get_compile_context(), input, boxes, box_ind, output, crop_size, method, extrapolation_value);
+}
+
+void CLCropResize::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *boxes, ICLTensor *box_ind, ICLTensor *output, Coordinates2D crop_size,
+ InterpolationPolicy method, float extrapolation_value)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, boxes, box_ind);
ARM_COMPUTE_ERROR_THROW_ON(CLCropResize::validate(input->info(), boxes->info(), box_ind->info(), output->info(), crop_size, method, extrapolation_value));
+ TensorShape output_shape = TensorShape(input->info()->tensor_shape()[0], crop_size.x, crop_size.y, boxes->info()->tensor_shape()[1]);
+ auto_init_if_empty(*output->info(), output_shape, 1, DataType::F32);
+
_num_boxes = boxes->info()->tensor_shape()[1];
TensorShape out_shape(input->info()->tensor_shape()[0], crop_size.x, crop_size.y);
@@ -210,7 +108,13 @@
// - A scale function is used to resize the cropped image to the size specified by crop_size.
// - A tensor is required to hold the final scaled image before it is copied into the 4D output
// that will hold all final cropped and scaled 3D images using CLCopyKernel.
- for(unsigned int i = 0; i < _num_boxes; ++i)
+
+ // The contents of _boxes and _box_ind are required to calculate the shape
+ // of the initial cropped image and thus are required to configure the
+ // kernels used for cropping and scaling.
+ _boxes->map(CLScheduler::get().queue());
+ _box_ind->map(CLScheduler::get().queue());
+ for(unsigned int num_box = 0; num_box < _num_boxes; ++num_box)
{
auto crop_tensor = support::cpp14::make_unique<CLTensor>();
TensorInfo crop_result_info(1, DataType::F32);
@@ -223,44 +127,148 @@
scaled_result_info.set_data_layout(DataLayout::NHWC);
scale_tensor->allocator()->init(scaled_result_info);
_scaled_results.emplace_back(std::move(scale_tensor));
+
+ // Size of the crop box in _boxes has to be given before the configure
+ uint32_t batch_index;
+ Coordinates start{};
+ Coordinates end{};
+ configure_crop(_input, _boxes, _box_ind, _crop_results[num_box].get(), num_box, start, end, batch_index);
+
+ auto scale_kernel = support::cpp14::make_unique<CLScale>();
+ scale_kernel->configure(compile_context, _crop_results[num_box].get(), _scaled_results[num_box].get(), _method, BorderMode::CONSTANT, PixelValue(_extrapolation_value), SamplingPolicy::TOP_LEFT);
+ _scale.emplace_back(std::move(scale_kernel));
+
+ Window win = calculate_max_window(*_output->info());
+ win.set(3, Window::Dimension(num_box, num_box + 1, 1));
+
+ auto copy_kernel = support::cpp14::make_unique<CLCopyKernel>();
+ copy_kernel->configure(compile_context, _scaled_results[num_box].get(), _output, PaddingList(), &win);
+ _copy.emplace_back(std::move(copy_kernel));
+
+ _crop_results[num_box]->allocator()->allocate();
+ _scaled_results[num_box]->allocator()->allocate();
+
+ bool is_width_flipped = end[0] < start[0];
+ bool is_height_flipped = end[1] < start[1];
+        /** The number of rows out of bounds at the start and end of _crop_results[num_box]. */
+ std::array<int32_t, 2> rows_out_of_bounds{ 0 };
+        /** The number of columns out of bounds at the start and end of _crop_results[num_box]. */
+ std::array<int32_t, 2> cols_out_of_bounds{ 0 };
+ if(is_height_flipped)
+ {
+ rows_out_of_bounds[0] = start[1] >= static_cast<int32_t>(_input->info()->dimension(2)) ? std::min(start[1] - _input->info()->dimension(2) + 1, _crop_results[num_box].get()->info()->dimension(2)) : 0;
+ rows_out_of_bounds[1] = end[1] < 0 ? std::min(-end[1], static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(2))) : 0;
+ }
+ else
+ {
+ rows_out_of_bounds[0] = start[1] < 0 ? std::min(-start[1], static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(2))) : 0;
+ rows_out_of_bounds[1] = end[1] >= static_cast<int32_t>(_input->info()->dimension(2)) ? std::min(end[1] - _input->info()->dimension(2) + 1, _crop_results[num_box].get()->info()->dimension(2)) : 0;
+ }
+ if(is_width_flipped)
+ {
+ cols_out_of_bounds[0] = start[0] >= static_cast<int32_t>(_input->info()->dimension(1)) ? std::min(start[0] - _input->info()->dimension(1) + 1, _crop_results[num_box].get()->info()->dimension(1)) : 0;
+ cols_out_of_bounds[1] = end[0] < 0 ? std::min(-end[0], static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(1))) : 0;
+ }
+ else
+ {
+ cols_out_of_bounds[0] = start[0] < 0 ? std::min(-start[0], static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(1))) : 0;
+ cols_out_of_bounds[1] = end[0] >= static_cast<int32_t>(_input->info()->dimension(1)) ? std::min(end[0] - _input->info()->dimension(1) + 1, _crop_results[num_box].get()->info()->dimension(1)) : 0;
+ }
+
+ Window full_window = calculate_max_window(*_crop_results[num_box].get()->info());
+
+        // Full _crop_results[num_box] window:
+ // --------------------------------
+ // | Out of bounds |
+ // | rows before |
+ // |------------------------------|
+ // | Out of | In | Out of |
+ // | bounds | bounds | bounds |
+ // | cols | elements | cols |
+ // | before | copied | after |
+ // | | from input | |
+ // |------------------------------|
+ // | Out of bounds |
+ // | rows after |
+ // |------------------------------|
+        // Use a separate window for each section of the full _crop_results[num_box] window.
+        // Fill all rows of _crop_results[num_box] that have no elements within the input bounds
+        // with the extrapolation value using memset.
+        // First fill the rows before the in-bounds rows.
+ if(rows_out_of_bounds[0] > 0)
+ {
+ Window slice_fill_rows_before(full_window);
+ slice_fill_rows_before.set(2, Window::Dimension(0, rows_out_of_bounds[0], 1));
+ auto kernel = arm_compute::support::cpp14::make_unique<CLMemsetKernel>();
+ kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, &slice_fill_rows_before);
+ _internal_kernels.push_back(std::move(kernel));
+ }
+
+ Window slice_in(full_window);
+ slice_in.set(2, Window::Dimension(rows_out_of_bounds[0], _crop_results[num_box].get()->info()->dimension(2) - rows_out_of_bounds[1], 1));
+ slice_in.set(1, Window::Dimension(cols_out_of_bounds[0], _crop_results[num_box].get()->info()->dimension(1) - cols_out_of_bounds[1], 1));
+
+ int rows_in_bounds = static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(2)) - rows_out_of_bounds[0] - rows_out_of_bounds[1];
+ if(rows_in_bounds > 0)
+ {
+ // Fill all elements that share a row with an in bounds element with the extrapolation value.
+ if(cols_out_of_bounds[0] > 0)
+ {
+ Window slice_fill_cols_before(slice_in);
+ slice_fill_cols_before.set(1, Window::Dimension(0, cols_out_of_bounds[0], 1));
+ auto kernel = arm_compute::support::cpp14::make_unique<CLMemsetKernel>();
+ kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, &slice_fill_cols_before);
+ _internal_kernels.push_back(std::move(kernel));
+ }
+
+ if(cols_out_of_bounds[1] > 0)
+ {
+ Window slice_fill_cols_after(slice_in);
+ slice_fill_cols_after.set(1, Window::Dimension(_crop_results[num_box].get()->info()->dimension(1) - cols_out_of_bounds[1], _crop_results[num_box].get()->info()->dimension(1), 1));
+ auto kernel = arm_compute::support::cpp14::make_unique<CLMemsetKernel>();
+ kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, &slice_fill_cols_after);
+ _internal_kernels.push_back(std::move(kernel));
+ }
+
+ // Copy all elements within the input bounds from the input tensor.
+ int cols_in_bounds = static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(1)) - cols_out_of_bounds[0] - cols_out_of_bounds[1];
+ if(cols_in_bounds > 0)
+ {
+ Coordinates2D start_in{ is_width_flipped ? start[0] - cols_out_of_bounds[0] : start[0] + cols_out_of_bounds[0],
+ is_height_flipped ? start[1] - rows_out_of_bounds[0] : start[1] + rows_out_of_bounds[0] };
+ Coordinates2D end_in{ is_width_flipped ? start_in.x - cols_in_bounds + 1 : start_in.x + cols_in_bounds - 1,
+ is_height_flipped ? start_in.y - rows_in_bounds + 1 : start_in.y + rows_in_bounds - 1 };
+ auto kernel = arm_compute::support::cpp14::make_unique<CLCropKernel>();
+
+ kernel->configure(compile_context, _input, _crop_results[num_box].get(), start_in, end_in, batch_index, extrapolation_value, &slice_in);
+ _internal_kernels.push_back(std::move(kernel));
+ }
+ }
+
+ // Fill all rows after the in bounds elements with the extrapolation value.
+ if(rows_out_of_bounds[1] > 0)
+ {
+ Window slice_fill_rows_after(full_window);
+ slice_fill_rows_after.set(2, Window::Dimension(_crop_results[num_box].get()->info()->dimension(2) - rows_out_of_bounds[1], _crop_results[num_box].get()->info()->dimension(2), 1));
+ auto kernel = arm_compute::support::cpp14::make_unique<CLMemsetKernel>();
+ kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, &slice_fill_rows_after);
+ _internal_kernels.push_back(std::move(kernel));
+ }
}
+ _boxes->unmap(CLScheduler::get().queue());
+ _box_ind->unmap(CLScheduler::get().queue());
+ CLScheduler::get().sync();
}
void CLCropResize::run()
{
ARM_COMPUTE_ERROR_ON_MSG(_output == nullptr, "Unconfigured function");
- // The contents of _boxes and _box_ind are required to calculate the shape
- // of the initial cropped image and thus are required to configure the
- // kernels used for cropping and scaling.
- _boxes->map(CLScheduler::get().queue());
- _box_ind->map(CLScheduler::get().queue());
- for(unsigned int i = 0; i < _num_boxes; ++i)
+
+ for(unsigned int i = 0; i < _internal_kernels.size(); ++i)
{
- // Size of the crop box in _boxes and thus the shape of _crop_results[i]
- // may not be known until run-time and so the kernels cannot be configured until then.
- uint32_t batch_index;
- Coordinates start{};
- Coordinates end{};
- configure_crop(_input, _boxes, _box_ind, _crop_results[i].get(), i, start, end, batch_index);
-
- auto scale_kernel = support::cpp14::make_unique<CLScale>();
- scale_kernel->configure(_crop_results[i].get(), _scaled_results[i].get(), _method, BorderMode::CONSTANT, PixelValue(_extrapolation_value), SamplingPolicy::TOP_LEFT);
- _scale.emplace_back(std::move(scale_kernel));
-
- Window win = calculate_max_window(*_output->info());
- win.set(3, Window::Dimension(i, i + 1, 1));
-
- auto copy_kernel = support::cpp14::make_unique<CLCopyKernel>();
- copy_kernel->configure(_scaled_results[i].get(), _output, PaddingList(), &win);
- _copy.emplace_back(std::move(copy_kernel));
-
- _crop_results[i]->allocator()->allocate();
- _scaled_results[i]->allocator()->allocate();
-
- run_crop(_input, _crop_results[i].get(), batch_index, start, end, _extrapolation_value);
+ CLScheduler::get().enqueue(*(_internal_kernels[i]));
}
- _boxes->unmap(CLScheduler::get().queue());
- _box_ind->unmap(CLScheduler::get().queue());
+
CLScheduler::get().sync();
for(auto &kernel : _scale)
{
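The two hunks above relocate all per-box kernel construction from run() into configure(): _boxes and _box_ind are mapped once at configure time, every CLMemsetKernel and CLCropKernel is built up front and collected in _internal_kernels, and run() merely enqueues them. A minimal standalone sketch of the out-of-bounds arithmetic repeated for each axis above; the helper name and signature are illustrative, not part of the arm_compute API:

#include <algorithm>
#include <array>
#include <cstdint>

// For one axis of a (possibly flipped) crop, return how many elements fall
// outside the input before and after the in-bounds run, clamped to the crop size.
std::array<int32_t, 2> out_of_bounds(int32_t start, int32_t end, int32_t in_dim, int32_t crop_dim)
{
    std::array<int32_t, 2> oob{ { 0, 0 } };
    if(end < start) // flipped axis
    {
        oob[0] = start >= in_dim ? std::min(start - in_dim + 1, crop_dim) : 0;
        oob[1] = end < 0 ? std::min(-end, crop_dim) : 0;
    }
    else
    {
        oob[0] = start < 0 ? std::min(-start, crop_dim) : 0;
        oob[1] = end >= in_dim ? std::min(end - in_dim + 1, crop_dim) : 0;
    }
    return oob;
}

For example, out_of_bounds(-2, 4, 5, 7) yields {2, 0}: the first two rows of the crop are memset to the extrapolation value and the remaining five are copied from the input.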
diff --git a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
index 7aa7714..62e7d9a 100644
--- a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -44,6 +44,12 @@
void CLDeconvolutionLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info,
const WeightsInfo &weights_info)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, weights, bias, output, deconv_info, weights_info);
+}
+
+void CLDeconvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info,
+ const WeightsInfo &weights_info)
+{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
switch(CLDeconvolutionLayer::get_deconvolution_method(input->info(), weights->info(), nullptr, output->info(), deconv_info, weights_info))
@@ -51,14 +57,14 @@
case DeconvolutionMethod::DIRECT:
{
auto f = arm_compute::support::cpp14::make_unique<CLDirectDeconvolutionLayer>();
- f->configure(input, weights, bias, output, deconv_info, weights_info);
+ f->configure(compile_context, input, weights, bias, output, deconv_info, weights_info);
_function = std::move(f);
break;
}
case DeconvolutionMethod::GEMM:
{
auto f = arm_compute::support::cpp14::make_unique<CLGEMMDeconvolutionLayer>(_memory_manager);
- f->configure(input, weights, bias, output, deconv_info);
+ f->configure(compile_context, input, weights, bias, output, deconv_info);
_function = std::move(f);
break;
}
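A usage sketch for the new overload pair: existing call sites keep compiling because the legacy configure() forwards to the global compile context, while new code may pass a context explicitly. Assumes an initialised CL backend and already-configured tensors; the helper name is illustrative:

#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/runtime/CL/functions/CLDeconvolutionLayer.h"

using namespace arm_compute;

void deconv_both_ways(ICLTensor *in, ICLTensor *weights, ICLTensor *out)
{
    CLDeconvolutionLayer legacy;
    // Legacy overload: forwards to CLKernelLibrary's global compile context.
    legacy.configure(in, weights, nullptr, out, PadStrideInfo(2, 2, 1, 1));

    CLDeconvolutionLayer explicit_ctx;
    // New overload: the caller supplies the compile context directly.
    explicit_ctx.configure(CLKernelLibrary::get().get_compile_context(),
                           in, weights, nullptr, out, PadStrideInfo(2, 2, 1, 1));
}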
diff --git a/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp b/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp
index eaf7c66..be2d120 100644
--- a/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp
+++ b/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -44,11 +44,16 @@
void CLDeconvolutionLayerUpsample::configure(ICLTensor *input, ICLTensor *output, const PadStrideInfo &info)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, output, info);
+}
+
+void CLDeconvolutionLayerUpsample::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const PadStrideInfo &info)
+{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
_output = output;
- _memset.configure(_output, PixelValue(0, _output->info()->data_type(), _output->info()->quantization_info()));
- _upsample.configure(input, _output, info);
+ _memset.configure(compile_context, _output, PixelValue(0, _output->info()->data_type(), _output->info()->quantization_info()));
+ _upsample.configure(compile_context, input, _output, info);
}
void CLDeconvolutionLayerUpsample::run()
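CLDeconvolutionLayerUpsample pairs a memset with the upsample kernel because upsampling only writes every stride-th output element. A scalar model of that semantics, ignoring the padding offsets carried in PadStrideInfo; purely illustrative, not the CL kernel:

#include <vector>

// Zero-fill the output (the memset step), then scatter the input at stride
// intervals (the upsample step).
std::vector<float> upsample_model(const std::vector<float> &in, int w, int h, int sx, int sy)
{
    const int out_w = w * sx;
    const int out_h = h * sy;
    std::vector<float> out(static_cast<size_t>(out_w) * out_h, 0.0f);
    for(int y = 0; y < h; ++y)
    {
        for(int x = 0; x < w; ++x)
        {
            out[static_cast<size_t>(y * sy) * out_w + x * sx] = in[static_cast<size_t>(y) * w + x];
        }
    }
    return out;
}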
diff --git a/src/runtime/CL/functions/CLDepthConvertLayer.cpp b/src/runtime/CL/functions/CLDepthConvertLayer.cpp
index dbf71ac..b848f98 100644
--- a/src/runtime/CL/functions/CLDepthConvertLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthConvertLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLDepthConvertLayer.h"
#include "arm_compute/core/CL/kernels/CLDepthConvertLayerKernel.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
#include <utility>
@@ -32,8 +32,13 @@
{
void CLDepthConvertLayer::configure(const ICLTensor *input, ICLTensor *output, ConvertPolicy policy, uint32_t shift)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, output, policy, shift);
+}
+
+void CLDepthConvertLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, ConvertPolicy policy, uint32_t shift)
+{
auto k = arm_compute::support::cpp14::make_unique<CLDepthConvertLayerKernel>();
- k->configure(input, output, policy, shift);
+ k->configure(compile_context, input, output, policy, shift);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLDepthToSpaceLayer.cpp b/src/runtime/CL/functions/CLDepthToSpaceLayer.cpp
index 1581dd9..89e5faa 100644
--- a/src/runtime/CL/functions/CLDepthToSpaceLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthToSpaceLayer.cpp
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLDepthToSpaceLayer.h"
#include "arm_compute/core/CL/kernels/CLDepthToSpaceLayerKernel.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
#include <utility>
@@ -32,8 +32,13 @@
{
void CLDepthToSpaceLayer::configure(const ICLTensor *input, ICLTensor *output, int32_t block_shape)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, output, block_shape);
+}
+
+void CLDepthToSpaceLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape)
+{
auto k = arm_compute::support::cpp14::make_unique<CLDepthToSpaceLayerKernel>();
- k->configure(input, output, block_shape);
+ k->configure(compile_context, input, output, block_shape);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
index e717f79..b1e9fe7 100644
--- a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -31,7 +31,7 @@
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
namespace arm_compute
{
@@ -117,33 +117,6 @@
}
} // namespace
-CLDepthwiseConvolutionLayer3x3::CLDepthwiseConvolutionLayer3x3(std::shared_ptr<IMemoryManager> memory_manager)
- : _func(std::move(memory_manager))
-{
-}
-
-void CLDepthwiseConvolutionLayer3x3::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier,
- ActivationLayerInfo act_info, const Size2D &dilation)
-{
- _func.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
-}
-
-Status CLDepthwiseConvolutionLayer3x3::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier, ActivationLayerInfo act_info, GPUTarget gpu_target, const Size2D &dilation)
-{
- return validate_arguments_3x3(input, weights, biases, output, conv_info, depth_multiplier, act_info, gpu_target, dilation);
-}
-
-void CLDepthwiseConvolutionLayer3x3::run()
-{
- _func.run();
-}
-
-void CLDepthwiseConvolutionLayer3x3::prepare()
-{
- _func.prepare();
-}
-
CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::CLDepthwiseConvolutionLayerGeneric(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(std::move(memory_manager)),
_dwc_native_kernel(),
@@ -167,6 +140,13 @@
void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
+}
+
+void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases,
+ ICLTensor *output, const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
+{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
ARM_COMPUTE_ERROR_THROW_ON(CLDepthwiseConvolutionLayer::validate(input->info(),
weights->info(),
@@ -193,11 +173,11 @@
_memory_group.manage(&_permuted_output);
// Configure the function to transform the input tensor from NCHW -> NHWC
- _permute_input_to_nhwc.configure(input, &_permuted_input, PermutationVector(2U, 0U, 1U));
+ _permute_input_to_nhwc.configure(compile_context, input, &_permuted_input, PermutationVector(2U, 0U, 1U));
_permuted_input.info()->set_data_layout(DataLayout::NHWC);
// Configure the function to transform the weights tensor from IHW -> HWI
- _permute_weights_to_nhwc.configure(weights, &_permuted_weights, PermutationVector(2U, 0U, 1U));
+ _permute_weights_to_nhwc.configure(compile_context, weights, &_permuted_weights, PermutationVector(2U, 0U, 1U));
_permuted_weights.info()->set_data_layout(DataLayout::NHWC);
// Set output quantization info before dwc kernel configure
@@ -226,7 +206,7 @@
dwc_weights_info.n0 = (depth_multiplier == 1) ? 8 : 1;
DWCKernelInfo dwc_info;
dwc_info.activation_info = act_info;
- _dwc_native_kernel.configure(input_to_use, weights_to_use, biases, output_to_use,
+ _dwc_native_kernel.configure(compile_context, input_to_use, weights_to_use, biases, output_to_use,
dwc_weights_info, dwc_info, conv_info, depth_multiplier, dilation,
output_multipliers_to_use, output_shifts_to_use);
@@ -236,7 +216,7 @@
         // Configure the function to transform the convolved output to NCHW format
_permuted_output.info()->set_data_layout(DataLayout::NCHW);
- _permute_output_to_nchw.configure(&_permuted_output, output, PermutationVector(1U, 2U, 0U));
+ _permute_output_to_nchw.configure(compile_context, &_permuted_output, output, PermutationVector(1U, 2U, 0U));
_permuted_output.allocator()->allocate();
}
@@ -386,11 +366,18 @@
void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
const PadStrideInfo &conv_info, unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
+}
+
+void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases,
+ ICLTensor *output,
+ const PadStrideInfo &conv_info, unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation)
+{
const GPUTarget gpu_target = CLScheduler::get().target();
// Perform validation step
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_ERROR_THROW_ON(CLDepthwiseConvolutionLayer3x3::validate(input->info(),
+ ARM_COMPUTE_ERROR_THROW_ON(CLDepthwiseConvolutionLayerInternal3x3::validate(input->info(),
weights->info(),
biases != nullptr ? biases->info() : nullptr,
output->info(),
@@ -429,11 +416,11 @@
_memory_group.manage(&_permuted_output);
// Configure the function to transform the input tensor from NHWC -> NCHW
- _permute_input_to_nchw.configure(input, &_permuted_input, PermutationVector(1U, 2U, 0U));
+ _permute_input_to_nchw.configure(compile_context, input, &_permuted_input, PermutationVector(1U, 2U, 0U));
_permuted_input.info()->set_data_layout(DataLayout::NCHW);
// Configure the function to transform the weights tensor from HWI -> IHW
- _permute_weights_to_nchw.configure(weights, &_permuted_weights, PermutationVector(1U, 2U, 0U));
+ _permute_weights_to_nchw.configure(compile_context, weights, &_permuted_weights, PermutationVector(1U, 2U, 0U));
_permuted_weights.info()->set_data_layout(DataLayout::NCHW);
_permuted_output.info()->set_quantization_info(output->info()->quantization_info());
@@ -447,7 +434,7 @@
{
if(_needs_weights_reshape)
{
- _reshape_weights.configure(weights, &_permuted_weights, info);
+ _reshape_weights.configure(compile_context, weights, &_permuted_weights, info);
weights_to_use = &_permuted_weights;
}
_kernel = arm_compute::support::cpp14::make_unique<CLDepthwiseConvolutionLayer3x3NHWCKernel>();
@@ -473,7 +460,7 @@
// Configure kernel
_kernel->set_target(gpu_target);
- _kernel->configure(input_to_use, weights_to_use, biases, output_to_use, conv_info, depth_multiplier,
+ _kernel->configure(compile_context, input_to_use, weights_to_use, biases, output_to_use, conv_info, depth_multiplier,
act_info, dilation, output_multipliers_to_use, output_shifts_to_use);
if(_is_quantized)
@@ -487,7 +474,7 @@
{
         // Configure the function to transform the convolved output back to the original data layout NHWC
_permuted_output.info()->set_data_layout(DataLayout::NCHW);
- _permute_output_to_nhwc.configure(&_permuted_output, output, PermutationVector(2U, 0U, 1U));
+ _permute_output_to_nhwc.configure(compile_context, &_permuted_output, output, PermutationVector(2U, 0U, 1U));
// Allocate tensors
_permuted_input.allocator()->allocate();
@@ -499,7 +486,7 @@
{
zero_value = PixelValue(static_cast<uint8_t>(input->info()->quantization_info().uniform().offset));
}
- _border_handler.configure(input_to_use, _kernel->border_size(), BorderMode::CONSTANT, zero_value);
+ _border_handler.configure(compile_context, input_to_use, _kernel->border_size(), BorderMode::CONSTANT, zero_value);
}
Status CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
@@ -575,6 +562,14 @@
void CLDepthwiseConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier,
ActivationLayerInfo act_info, const Size2D &dilation)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
+}
+
+void CLDepthwiseConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
+ const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier,
+ ActivationLayerInfo act_info, const Size2D &dilation)
+{
const GPUTarget gpu_target = CLScheduler::get().target();
_depth_conv_func = get_depthwiseconvolution_function(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, depth_multiplier, act_info,
dilation, gpu_target);
@@ -582,12 +577,12 @@
{
case DepthwiseConvolutionFunction::OPTIMIZED:
_func_3x3.set_memory_group(_memory_manager);
- _func_3x3.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
+ _func_3x3.configure(compile_context, input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
break;
case DepthwiseConvolutionFunction::GENERIC:
{
_func_generic.set_memory_group(_memory_manager);
- _func_generic.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
+ _func_generic.configure(compile_context, input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
}
break;
default:
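This hunk deletes the public CLDepthwiseConvolutionLayer3x3 wrapper; the optimized 3x3 path survives as CLDepthwiseConvolutionLayerInternal3x3 and is chosen automatically. A migration sketch for former 3x3 callers, assuming already-initialised tensors:

#include "arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h"

using namespace arm_compute;

// Previously: CLDepthwiseConvolutionLayer3x3 dw; dw.configure(...);
// Now the generic function routes 3x3 cases to the optimized internal path
// via get_depthwiseconvolution_function().
void depthwise_3x3(ICLTensor *input, ICLTensor *weights, ICLTensor *biases, ICLTensor *output)
{
    CLDepthwiseConvolutionLayer dw;
    dw.configure(input, weights, biases, output, PadStrideInfo(1, 1, 1, 1));
    dw.run();
}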
diff --git a/src/runtime/CL/functions/CLDequantizationLayer.cpp b/src/runtime/CL/functions/CLDequantizationLayer.cpp
index cdfdfc7..362b36c 100644
--- a/src/runtime/CL/functions/CLDequantizationLayer.cpp
+++ b/src/runtime/CL/functions/CLDequantizationLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,14 +24,19 @@
#include "arm_compute/runtime/CL/functions/CLDequantizationLayer.h"
#include "arm_compute/core/CL/kernels/CLDequantizationLayerKernel.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
namespace arm_compute
{
void CLDequantizationLayer::configure(const ICLTensor *input, ICLTensor *output)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, output);
+}
+
+void CLDequantizationLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
+{
auto k = arm_compute::support::cpp14::make_unique<CLDequantizationLayerKernel>();
- k->configure(input, output);
+ k->configure(compile_context, input, output);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLDerivative.cpp b/src/runtime/CL/functions/CLDerivative.cpp
index ae49996..68d3752 100644
--- a/src/runtime/CL/functions/CLDerivative.cpp
+++ b/src/runtime/CL/functions/CLDerivative.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,7 +25,7 @@
#include "arm_compute/core/CL/kernels/CLDerivativeKernel.h"
#include "arm_compute/core/PixelValue.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
#include <utility>
@@ -33,8 +33,13 @@
void CLDerivative::configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, output_x, output_y, border_mode, constant_border_value);
+}
+
+void CLDerivative::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value)
+{
auto k = arm_compute::support::cpp14::make_unique<CLDerivativeKernel>();
- k->configure(input, output_x, output_y, border_mode == BorderMode::UNDEFINED);
+ k->configure(compile_context, input, output_x, output_y, border_mode == BorderMode::UNDEFINED);
_kernel = std::move(k);
- _border_handler.configure(input, BorderSize(1), border_mode, PixelValue(constant_border_value));
+ _border_handler.configure(compile_context, input, BorderSize(1), border_mode, PixelValue(constant_border_value));
}
diff --git a/src/runtime/CL/functions/CLDilate.cpp b/src/runtime/CL/functions/CLDilate.cpp
index 59c5ea5..05351a9 100644
--- a/src/runtime/CL/functions/CLDilate.cpp
+++ b/src/runtime/CL/functions/CLDilate.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,7 +25,7 @@
#include "arm_compute/core/CL/kernels/CLDilateKernel.h"
#include "arm_compute/core/PixelValue.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
#include <utility>
@@ -33,8 +33,13 @@
void CLDilate::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, output, border_mode, constant_border_value);
+}
+
+void CLDilate::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
+{
auto k = arm_compute::support::cpp14::make_unique<CLDilateKernel>();
- k->configure(input, output, border_mode == BorderMode::UNDEFINED);
+ k->configure(compile_context, input, output, border_mode == BorderMode::UNDEFINED);
_kernel = std::move(k);
- _border_handler.configure(input, BorderSize(1), border_mode, PixelValue(constant_border_value));
+ _border_handler.configure(compile_context, input, BorderSize(1), border_mode, PixelValue(constant_border_value));
}
diff --git a/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp b/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp
index bfc6ff1..6e9782f 100644
--- a/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -39,19 +39,26 @@
void CLDirectConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, act_info);
+}
+
+void CLDirectConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info)
+{
// Set GPU target
_direct_conv_kernel.set_target(CLScheduler::get().target());
// Configure direct convolution
- _direct_conv_kernel.configure(input, weights, biases, output, conv_info);
+ _direct_conv_kernel.configure(compile_context, input, weights, biases, output, conv_info);
// Configure border handler
PixelValue &&zero_value(0.f);
if(is_data_type_quantized_asymmetric(input->info()->data_type()))
{
- zero_value = PixelValue(static_cast<uint8_t>(input->info()->quantization_info().uniform().offset));
+ zero_value = PixelValue(0, input->info()->data_type(), input->info()->quantization_info());
}
- _input_border_handler.configure(input, _direct_conv_kernel.border_size(), BorderMode::CONSTANT, zero_value);
+ _input_border_handler.configure(compile_context, input, _direct_conv_kernel.border_size(), BorderMode::CONSTANT, zero_value);
// Tune kernels
CLScheduler::get().tune_kernel_static(_direct_conv_kernel);
@@ -61,7 +68,7 @@
//Configure Activation Layer
if(_is_activationlayer_enabled)
{
- _activationlayer_function.configure(output, nullptr, act_info);
+ _activationlayer_function.configure(compile_context, output, nullptr, act_info);
}
}
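The border-value change above matters for quantized types: the constant border must be a real 0 expressed in the quantized domain, i.e. the zero-point, which the old uint8_t cast could not represent for QASYMM8_SIGNED. A minimal sketch of the mapping PixelValue(0, data_type, quantization_info) effectively performs; the helper is illustrative, not the PixelValue implementation:

#include <algorithm>
#include <cmath>
#include <cstdint>

// Quantize a real value with the usual asymmetric scheme q = round(v / scale) + offset,
// clamped to the type's range. For v == 0 this returns the zero-point.
int32_t quantize_value(float v, float scale, int32_t offset, int32_t qmin, int32_t qmax)
{
    const int32_t q = static_cast<int32_t>(std::lround(v / scale)) + offset;
    return std::min(std::max(q, qmin), qmax);
}

// quantize_value(0.f, 0.5f, -10, -128, 127) == -10, the QASYMM8_SIGNED zero-point.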
diff --git a/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp
index b8089d8..da16bed 100644
--- a/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -53,7 +53,7 @@
const WeightsInfo &weights_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights);
const DataLayout data_layout = input->data_layout();
@@ -87,10 +87,10 @@
ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_h) != output_shape[idx_h], "Output's height is invalid.");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_c) != output_shape[idx_c], "Output's depth is invalid.");
- unsigned int deconv_pad_x = 0;
- unsigned int deconv_pad_y = 0;
- const unsigned int stride_x = info.stride().first;
- const unsigned int stride_y = info.stride().second;
+ unsigned int deconv_pad_x = 0;
+ unsigned int deconv_pad_y = 0;
+ const unsigned int stride_x = info.stride().first;
+ const unsigned int stride_y = info.stride().second;
const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input, *weights, stride_x, stride_y, out_dims, deconv_pad_x, deconv_pad_y);
TensorInfo scale_out_info(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape).set_data_layout(data_layout));
const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
@@ -104,14 +104,20 @@
void CLDirectDeconvolutionLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &info,
const WeightsInfo &weights_info)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, weights, bias, output, info, weights_info);
+}
+
+void CLDirectDeconvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &info,
+ const WeightsInfo &weights_info)
+{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
const unsigned int pad_left = info.pad_left();
const unsigned int pad_right = info.pad_right();
const unsigned int pad_top = info.pad_top();
const unsigned int pad_bottom = info.pad_bottom();
- const unsigned int stride_x = info.stride().first;
- const unsigned int stride_y = info.stride().second;
+ const unsigned int stride_x = info.stride().first;
+ const unsigned int stride_y = info.stride().second;
const DataLayout data_layout = input->info()->data_layout();
@@ -121,7 +127,7 @@
_original_weights = weights;
_flip_axis.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::U32));
_weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout));
- _flip_weights.configure(weights, &_weights_flipped, &_flip_axis);
+ _flip_weights.configure(compile_context, weights, &_weights_flipped, &_flip_axis);
auto out_dims = deconvolution_output_dimensions(input->info()->dimension(idx_w), input->info()->dimension(idx_h), weights->info()->dimension(idx_w), weights->info()->dimension(idx_h), info);
@@ -146,14 +152,14 @@
unsigned int deconv_pad_right = pad_left > pad_right ? pad_left - pad_right : 0;
deconv_pad_x -= deconv_pad_left + deconv_pad_right;
ARM_COMPUTE_ERROR_ON((deconv_pad_x % 2) != 0);
- deconv_pad_left += deconv_pad_x / 2;
+ deconv_pad_left += deconv_pad_x / 2;
deconv_pad_right += deconv_pad_x / 2;
unsigned int deconv_pad_top = pad_bottom > pad_top ? pad_bottom - pad_top : 0;
unsigned int deconv_pad_bottom = pad_top > pad_bottom ? pad_top - pad_bottom : 0;
deconv_pad_y -= deconv_pad_top + deconv_pad_bottom;
ARM_COMPUTE_ERROR_ON((deconv_pad_y % 2) != 0);
- deconv_pad_top += deconv_pad_y / 2;
+ deconv_pad_top += deconv_pad_y / 2;
deconv_pad_bottom += deconv_pad_y / 2;
TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), input->info()->quantization_info());
@@ -162,11 +168,11 @@
// configure scale function
const PadStrideInfo upsample_info(stride_x, stride_y, deconv_pad_left, deconv_pad_right, deconv_pad_top, deconv_pad_bottom, DimensionRoundingType::FLOOR);
- _scale_f.configure(input, &_scaled_output, upsample_info);
+ _scale_f.configure(compile_context, input, &_scaled_output, upsample_info);
// Setup the function to convolve the upscaled output
const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
- _conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info, weights_info);
+ _conv_f.configure(compile_context, &_scaled_output, &_weights_flipped, bias, output, conv_info, weights_info);
_scaled_output.allocator()->allocate();
// Setup flip axis data
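The padding arithmetic above splits the leftover upsample padding across both sides of an axis after compensating for user-specified asymmetry; the ARM_COMPUTE_ERROR_ON checks guarantee an even remainder. A scalar sketch of that split for one axis, with illustrative names:

#include <cassert>

// Mirror of the deconv_pad_x / deconv_pad_y handling: give the smaller-padded
// side its deficit first, then share the remaining padding equally.
void split_pad(unsigned int deconv_pad, unsigned int pad_a, unsigned int pad_b,
               unsigned int &side_a, unsigned int &side_b)
{
    side_a = pad_b > pad_a ? pad_b - pad_a : 0;
    side_b = pad_a > pad_b ? pad_a - pad_b : 0;
    deconv_pad -= side_a + side_b;
    assert(deconv_pad % 2 == 0);
    side_a += deconv_pad / 2;
    side_b += deconv_pad / 2;
}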
diff --git a/src/runtime/CL/functions/CLElementWiseUnaryLayer.cpp b/src/runtime/CL/functions/CLElementWiseUnaryLayer.cpp
index b9ebf69..ce61532 100644
--- a/src/runtime/CL/functions/CLElementWiseUnaryLayer.cpp
+++ b/src/runtime/CL/functions/CLElementWiseUnaryLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLElementWiseUnaryLayer.h"
#include "arm_compute/core/CL/kernels/CLElementWiseUnaryLayerKernel.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
#include <utility>
@@ -32,8 +32,13 @@
{
void CLRsqrtLayer::configure(const ICLTensor *input, ICLTensor *output)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, output);
+}
+
+void CLRsqrtLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
+{
auto k = arm_compute::support::cpp14::make_unique<CLElementWiseUnaryLayerKernel>();
- k->configure(input, output, ElementWiseUnary::RSQRT);
+ k->configure(compile_context, input, output, ElementWiseUnary::RSQRT);
_kernel = std::move(k);
}
Status CLRsqrtLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
@@ -43,8 +48,13 @@
void CLExpLayer::configure(const ICLTensor *input, ICLTensor *output)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, output);
+}
+
+void CLExpLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
+{
auto k = arm_compute::support::cpp14::make_unique<CLElementWiseUnaryLayerKernel>();
- k->configure(input, output, ElementWiseUnary::EXP);
+ k->configure(compile_context, input, output, ElementWiseUnary::EXP);
_kernel = std::move(k);
}
Status CLExpLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
@@ -54,8 +64,13 @@
void CLNegLayer::configure(const ICLTensor *input, ICLTensor *output)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, output);
+}
+
+void CLNegLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
+{
auto k = arm_compute::support::cpp14::make_unique<CLElementWiseUnaryLayerKernel>();
- k->configure(input, output, ElementWiseUnary::NEG);
+ k->configure(compile_context, input, output, ElementWiseUnary::NEG);
_kernel = std::move(k);
}
Status CLNegLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
@@ -65,8 +80,13 @@
void CLSinLayer::configure(const ICLTensor *input, ICLTensor *output)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, output);
+}
+
+void CLSinLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
+{
auto k = arm_compute::support::cpp14::make_unique<CLElementWiseUnaryLayerKernel>();
- k->configure(input, output, ElementWiseUnary::SIN);
+ k->configure(compile_context, input, output, ElementWiseUnary::SIN);
_kernel = std::move(k);
}
Status CLSinLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
@@ -76,8 +96,13 @@
void CLAbsLayer::configure(const ICLTensor *input, ICLTensor *output)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, output);
+}
+
+void CLAbsLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
+{
auto k = arm_compute::support::cpp14::make_unique<CLElementWiseUnaryLayerKernel>();
- k->configure(input, output, ElementWiseUnary::ABS);
+ k->configure(compile_context, input, output, ElementWiseUnary::ABS);
_kernel = std::move(k);
}
Status CLAbsLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
@@ -86,8 +111,13 @@
}
void CLLogLayer::configure(const ICLTensor *input, ICLTensor *output)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, output);
+}
+
+void CLLogLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
+{
auto k = arm_compute::support::cpp14::make_unique<CLElementWiseUnaryLayerKernel>();
- k->configure(input, output, ElementWiseUnary::LOG);
+ k->configure(compile_context, input, output, ElementWiseUnary::LOG);
_kernel = std::move(k);
}
Status CLLogLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
@@ -97,8 +127,13 @@
void CLRoundLayer::configure(const ICLTensor *input, ICLTensor *output)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, output);
+}
+
+void CLRoundLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
+{
auto k = arm_compute::support::cpp14::make_unique<CLElementWiseUnaryLayerKernel>();
- k->configure(input, output, ElementWiseUnary::ROUND);
+ k->configure(compile_context, input, output, ElementWiseUnary::ROUND);
_kernel = std::move(k);
}
Status CLRoundLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
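The compile-context overloads exist so several functions can share one compilation environment rather than each reaching for the global singleton internally. A usage sketch chaining two of the unary layers above; tensor setup elided:

#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/runtime/CL/functions/CLElementWiseUnaryLayer.h"

using namespace arm_compute;

void exp_then_log(ICLTensor *x, ICLTensor *tmp, ICLTensor *y)
{
    // Both kernels are compiled against the same context.
    const CLCompileContext &ctx = CLKernelLibrary::get().get_compile_context();
    CLExpLayer exp_layer;
    CLLogLayer log_layer;
    exp_layer.configure(ctx, x, tmp);
    log_layer.configure(ctx, tmp, y);
    exp_layer.run();
    log_layer.run();
}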
diff --git a/src/runtime/CL/functions/CLElementwiseOperations.cpp b/src/runtime/CL/functions/CLElementwiseOperations.cpp
index 8317e0d..20e9545 100644
--- a/src/runtime/CL/functions/CLElementwiseOperations.cpp
+++ b/src/runtime/CL/functions/CLElementwiseOperations.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,7 +25,7 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
#include <utility>
@@ -33,7 +33,7 @@
{
namespace
{
-void configure_border_handler(CLFillBorderKernel &border_handler, BorderSize border_size, ICLTensor *input1, ICLTensor *input2, const ICLTensor *output)
+void configure_border_handler(const CLCompileContext &compile_context, CLFillBorderKernel &border_handler, BorderSize border_size, ICLTensor *input1, ICLTensor *input2, const ICLTensor *output)
{
if(output->info()->dimension(0) > 1)
{
@@ -41,102 +41,137 @@
if(broadcasted_info->info()->dimension(0) == 1)
{
- border_handler.configure(broadcasted_info, border_size, BorderMode::REPLICATE);
+ border_handler.configure(compile_context, broadcasted_info, border_size, BorderMode::REPLICATE);
}
}
}
} // namespace
-void CLArithmeticAddition::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy)
+void CLArithmeticAddition::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+{
+ configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, policy, act_info);
+}
+
+void CLArithmeticAddition::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
{
auto k = arm_compute::support::cpp14::make_unique<CLSaturatedArithmeticOperationKernel>();
- k->configure(ArithmeticOperation::ADD, input1, input2, output, policy);
+ k->configure(compile_context, ArithmeticOperation::ADD, input1, input2, output, policy, act_info);
_kernel = std::move(k);
- configure_border_handler(_border_handler, _kernel->border_size(), input1, input2, output);
+ configure_border_handler(compile_context, _border_handler, _kernel->border_size(), input1, input2, output);
}
-Status CLArithmeticAddition::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
+Status CLArithmeticAddition::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
{
- return CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, input1, input2, output, policy);
+ return CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, input1, input2, output, policy, act_info);
}
-void CLArithmeticSubtraction::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy)
+void CLArithmeticSubtraction::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+{
+ configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, policy, act_info);
+}
+
+void CLArithmeticSubtraction::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
{
auto k = arm_compute::support::cpp14::make_unique<CLSaturatedArithmeticOperationKernel>();
- k->configure(ArithmeticOperation::SUB, input1, input2, output, policy);
+ k->configure(compile_context, ArithmeticOperation::SUB, input1, input2, output, policy, act_info);
_kernel = std::move(k);
- configure_border_handler(_border_handler, _kernel->border_size(), input1, input2, output);
+ configure_border_handler(compile_context, _border_handler, _kernel->border_size(), input1, input2, output);
}
-Status CLArithmeticSubtraction::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
+Status CLArithmeticSubtraction::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_UNUSED(policy);
- return CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::SUB, input1, input2, output, policy);
+ return CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::SUB, input1, input2, output, policy, act_info);
}
-void CLArithmeticDivision::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output)
+void CLArithmeticDivision::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+{
+ configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info);
+}
+
+void CLArithmeticDivision::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
{
auto k = arm_compute::support::cpp14::make_unique<CLArithmeticOperationKernel>();
- k->configure(ArithmeticOperation::DIV, input1, input2, output);
+ k->configure(compile_context, ArithmeticOperation::DIV, input1, input2, output, act_info);
_kernel = std::move(k);
- configure_border_handler(_border_handler, _kernel->border_size(), input1, input2, output);
+ configure_border_handler(compile_context, _border_handler, _kernel->border_size(), input1, input2, output);
}
-Status CLArithmeticDivision::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+Status CLArithmeticDivision::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
{
- return CLArithmeticOperationKernel::validate(ArithmeticOperation::DIV, input1, input2, output);
+ return CLArithmeticOperationKernel::validate(ArithmeticOperation::DIV, input1, input2, output, act_info);
}
-void CLElementwiseMax::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output)
+void CLElementwiseMax::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+{
+ configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info);
+}
+
+void CLElementwiseMax::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
{
auto k = arm_compute::support::cpp14::make_unique<CLArithmeticOperationKernel>();
- k->configure(ArithmeticOperation::MAX, input1, input2, output);
+ k->configure(compile_context, ArithmeticOperation::MAX, input1, input2, output, act_info);
_kernel = std::move(k);
- configure_border_handler(_border_handler, _kernel->border_size(), input1, input2, output);
+ configure_border_handler(compile_context, _border_handler, _kernel->border_size(), input1, input2, output);
}
-Status CLElementwiseMax::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+Status CLElementwiseMax::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
{
- return CLArithmeticOperationKernel::validate(ArithmeticOperation::MAX, input1, input2, output);
+ return CLArithmeticOperationKernel::validate(ArithmeticOperation::MAX, input1, input2, output, act_info);
}
-void CLElementwiseMin::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output)
+void CLElementwiseMin::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+{
+ configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info);
+}
+
+void CLElementwiseMin::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
{
auto k = arm_compute::support::cpp14::make_unique<CLArithmeticOperationKernel>();
- k->configure(ArithmeticOperation::MIN, input1, input2, output);
+ k->configure(compile_context, ArithmeticOperation::MIN, input1, input2, output, act_info);
_kernel = std::move(k);
- configure_border_handler(_border_handler, _kernel->border_size(), input1, input2, output);
+ configure_border_handler(compile_context, _border_handler, _kernel->border_size(), input1, input2, output);
}
-Status CLElementwiseMin::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+Status CLElementwiseMin::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
{
- return CLArithmeticOperationKernel::validate(ArithmeticOperation::MIN, input1, input2, output);
+ return CLArithmeticOperationKernel::validate(ArithmeticOperation::MIN, input1, input2, output, act_info);
}
-void CLElementwiseSquaredDiff::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output)
+void CLElementwiseSquaredDiff::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+{
+ configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info);
+}
+
+void CLElementwiseSquaredDiff::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
{
auto k = arm_compute::support::cpp14::make_unique<CLArithmeticOperationKernel>();
- k->configure(ArithmeticOperation::SQUARED_DIFF, input1, input2, output);
+ k->configure(compile_context, ArithmeticOperation::SQUARED_DIFF, input1, input2, output, act_info);
_kernel = std::move(k);
- configure_border_handler(_border_handler, _kernel->border_size(), input1, input2, output);
+ configure_border_handler(compile_context, _border_handler, _kernel->border_size(), input1, input2, output);
}
-Status CLElementwiseSquaredDiff::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+Status CLElementwiseSquaredDiff::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
{
- return CLArithmeticOperationKernel::validate(ArithmeticOperation::SQUARED_DIFF, input1, input2, output);
+ return CLArithmeticOperationKernel::validate(ArithmeticOperation::SQUARED_DIFF, input1, input2, output, act_info);
}
-void CLElementwisePower::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output)
+void CLElementwisePower::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+{
+ configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info);
+}
+
+void CLElementwisePower::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
{
auto k = arm_compute::support::cpp14::make_unique<CLArithmeticOperationKernel>();
- k->configure(ArithmeticOperation::POWER, input1, input2, output);
+ k->configure(compile_context, ArithmeticOperation::POWER, input1, input2, output, act_info);
_kernel = std::move(k);
- configure_border_handler(_border_handler, _kernel->border_size(), input1, input2, output);
+ configure_border_handler(compile_context, _border_handler, _kernel->border_size(), input1, input2, output);
}
-Status CLElementwisePower::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+Status CLElementwisePower::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
{
- return CLArithmeticOperationKernel::validate(ArithmeticOperation::POWER, input1, input2, output);
+ return CLArithmeticOperationKernel::validate(ArithmeticOperation::POWER, input1, input2, output, act_info);
}
} // namespace arm_compute
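Besides the compile-context plumbing, every binary element-wise function above gains a trailing ActivationLayerInfo so an activation can be fused into the arithmetic kernel. A usage sketch with already-configured tensors:

#include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h"

using namespace arm_compute;

// Fuse a bounded ReLU into the addition instead of running CLActivationLayer
// as a second pass over the output.
void add_with_relu6(ICLTensor *a, ICLTensor *b, ICLTensor *out)
{
    CLArithmeticAddition add;
    add.configure(a, b, out, ConvertPolicy::SATURATE,
                  ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f));
    add.run();
}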
diff --git a/src/runtime/CL/functions/CLEqualizeHistogram.cpp b/src/runtime/CL/functions/CLEqualizeHistogram.cpp
index a0663b7..e1bd7e6 100644
--- a/src/runtime/CL/functions/CLEqualizeHistogram.cpp
+++ b/src/runtime/CL/functions/CLEqualizeHistogram.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -89,9 +89,14 @@
void CLEqualizeHistogram::configure(const ICLImage *input, ICLImage *output)
{
- _histogram_kernel.configure(input, &_hist);
- _border_histogram_kernel.configure(input, &_hist);
- _map_histogram_kernel.configure(input, &_cd_lut, output);
+ configure(CLKernelLibrary::get().get_compile_context(), input, output);
+}
+
+void CLEqualizeHistogram::configure(const CLCompileContext &compile_context, const ICLImage *input, ICLImage *output)
+{
+ _histogram_kernel.configure(compile_context, input, &_hist);
+ _border_histogram_kernel.configure(compile_context, input, &_hist);
+ _map_histogram_kernel.configure(compile_context, input, &_cd_lut, output);
}
void CLEqualizeHistogram::run()
diff --git a/src/runtime/CL/functions/CLErode.cpp b/src/runtime/CL/functions/CLErode.cpp
index eb1f6e4..8106148 100644
--- a/src/runtime/CL/functions/CLErode.cpp
+++ b/src/runtime/CL/functions/CLErode.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,7 +25,7 @@
#include "arm_compute/core/CL/kernels/CLErodeKernel.h"
#include "arm_compute/core/PixelValue.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
#include <utility>
@@ -33,8 +33,13 @@
void CLErode::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, output, border_mode, constant_border_value);
+}
+
+void CLErode::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
+{
auto k = arm_compute::support::cpp14::make_unique<CLErodeKernel>();
- k->configure(input, output, border_mode == BorderMode::UNDEFINED);
+ k->configure(compile_context, input, output, border_mode == BorderMode::UNDEFINED);
_kernel = std::move(k);
- _border_handler.configure(input, BorderSize(1), border_mode, PixelValue(constant_border_value));
+ _border_handler.configure(compile_context, input, BorderSize(1), border_mode, PixelValue(constant_border_value));
}
diff --git a/src/runtime/CL/functions/CLFFT1D.cpp b/src/runtime/CL/functions/CLFFT1D.cpp
index 49b5a2a..c3922f5 100644
--- a/src/runtime/CL/functions/CLFFT1D.cpp
+++ b/src/runtime/CL/functions/CLFFT1D.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -37,6 +37,11 @@
void CLFFT1D::configure(const ICLTensor *input, ICLTensor *output, const FFT1DInfo &config)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, output, config);
+}
+
+void CLFFT1D::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const FFT1DInfo &config)
+{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_ERROR_THROW_ON(CLFFT1D::validate(input->info(), output->info(), config));
@@ -57,7 +62,7 @@
TensorInfo digit_reverse_indices_info(TensorShape(input->info()->tensor_shape()[config.axis]), 1, DataType::U32);
_digit_reverse_indices.allocator()->init(digit_reverse_indices_info);
_memory_group.manage(&_digit_reversed_input);
- _digit_reverse_kernel.configure(input, &_digit_reversed_input, &_digit_reverse_indices, digit_reverse_config);
+ _digit_reverse_kernel.configure(compile_context, input, &_digit_reversed_input, &_digit_reverse_indices, digit_reverse_config);
// Create and configure FFT kernels
unsigned int Nx = 1;
@@ -72,7 +77,7 @@
fft_kernel_info.radix = radix_for_stage;
fft_kernel_info.Nx = Nx;
fft_kernel_info.is_first_stage = (i == 0);
- _fft_kernels[i].configure(&_digit_reversed_input, ((i == (_num_ffts - 1)) && !is_c2r) ? output : nullptr, fft_kernel_info);
+ _fft_kernels[i].configure(compile_context, &_digit_reversed_input, ((i == (_num_ffts - 1)) && !is_c2r) ? output : nullptr, fft_kernel_info);
Nx *= radix_for_stage;
}
@@ -83,7 +88,7 @@
FFTScaleKernelInfo scale_config;
scale_config.scale = static_cast<float>(N);
scale_config.conjugate = config.direction == FFTDirection::Inverse;
- is_c2r ? _scale_kernel.configure(&_digit_reversed_input, output, scale_config) : _scale_kernel.configure(output, nullptr, scale_config);
+ is_c2r ? _scale_kernel.configure(compile_context, &_digit_reversed_input, output, scale_config) : _scale_kernel.configure(compile_context, output, nullptr, scale_config);
}
// Allocate tensors
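
The staging loop above configures one radix kernel per FFT stage: the transform length along the chosen axis is factored into supported radices, and Nx carries the product of the radices consumed so far, so each stage knows the span already combined. A standalone sketch of that decomposition and staging (the greedy factorisation and the radix set are illustrative; the library derives its decomposition internally):

    #include <cstddef>
    #include <cstdio>
    #include <vector>

    // Greedily factor N into radices drawn from a supported set.
    std::vector<unsigned> decompose(unsigned N, const std::vector<unsigned> &radices)
    {
        std::vector<unsigned> stages;
        for(unsigned r : radices)
        {
            while(N % r == 0) { stages.push_back(r); N /= r; }
        }
        return (N == 1) ? stages : std::vector<unsigned>{}; // empty => unsupported size
    }

    int main()
    {
        const auto stages = decompose(64, { 8, 4, 2 }); // {8, 8}
        unsigned Nx = 1;
        for(std::size_t i = 0; i < stages.size(); ++i)
        {
            const bool is_first_stage = (i == 0);
            std::printf("stage %zu: radix=%u Nx=%u first=%d\n", i, stages[i], Nx, is_first_stage);
            Nx *= stages[i]; // the next stage sees the combined span
        }
    }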
diff --git a/src/runtime/CL/functions/CLFFT2D.cpp b/src/runtime/CL/functions/CLFFT2D.cpp
index 165e784..2482ea9 100644
--- a/src/runtime/CL/functions/CLFFT2D.cpp
+++ b/src/runtime/CL/functions/CLFFT2D.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -36,21 +36,26 @@
void CLFFT2D::configure(const ICLTensor *input, ICLTensor *output, const FFT2DInfo &config)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, output, config);
+}
+
+void CLFFT2D::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const FFT2DInfo &config)
+{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_ERROR_THROW_ON(CLFFT2D::validate(input->info(), output->info(), config));
// Setup first pass
FFT1DInfo first_pass_config;
- first_pass_config.axis = config.axes.first;
+ first_pass_config.axis = config.axis0;
first_pass_config.direction = config.direction;
_memory_group.manage(&_first_pass_tensor);
- _first_pass_func.configure(input, &_first_pass_tensor, first_pass_config);
+ _first_pass_func.configure(compile_context, input, &_first_pass_tensor, first_pass_config);
// Setup second pass
FFT1DInfo second_pass_config;
- second_pass_config.axis = config.axes.second;
+ second_pass_config.axis = config.axis1;
second_pass_config.direction = config.direction;
- _second_pass_func.configure(&_first_pass_tensor, output, second_pass_config);
+ _second_pass_func.configure(compile_context, &_first_pass_tensor, output, second_pass_config);
_first_pass_tensor.allocator()->allocate();
}
@@ -63,13 +68,13 @@
// Validate first pass
FFT1DInfo first_pass_config;
- first_pass_config.axis = config.axes.first;
+ first_pass_config.axis = config.axis0;
first_pass_config.direction = config.direction;
ARM_COMPUTE_RETURN_ON_ERROR(CLFFT1D::validate(input, &first_pass_tensor, first_pass_config));
// Validate second pass
FFT1DInfo second_pass_config;
- second_pass_config.axis = config.axes.second;
+ second_pass_config.axis = config.axis1;
second_pass_config.direction = config.direction;
ARM_COMPUTE_RETURN_ON_ERROR(CLFFT1D::validate(&first_pass_tensor, output, second_pass_config));
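
The rename from config.axes.first/second to config.axis0/axis1 makes the structure of this function explicit: a 2D FFT is two 1D passes with the same direction, one per axis, through a managed intermediate tensor. A standalone sketch of that control flow with stand-in types:

    struct FFT1DInfo { unsigned axis{}; bool inverse{}; };
    struct FFT2DInfo { unsigned axis0{ 0 }, axis1{ 1 }; bool inverse{}; };

    struct FFT1D
    {
        void configure(const float *, float *, const FFT1DInfo &) { /* one 1D pass */ }
    };

    struct FFT2D
    {
        void configure(const float *in, float *out, const FFT2DInfo &cfg)
        {
            const FFT1DInfo first{ cfg.axis0, cfg.inverse };  // pass 1: along axis0
            const FFT1DInfo second{ cfg.axis1, cfg.inverse }; // pass 2: along axis1
            _first_pass.configure(in, _scratch, first);       // input -> intermediate
            _second_pass.configure(_scratch, out, second);    // intermediate -> output
        }

        FFT1D _first_pass{}, _second_pass{};
        float _scratch[64]{};  // stands in for the managed _first_pass_tensor
    };

    int main()
    {
        float in[64]{}, out[64]{};
        FFT2D fft;
        fft.configure(in, out, FFT2DInfo{});
    }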
diff --git a/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp b/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp
index afb1cab..ff439cc 100644
--- a/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -98,6 +98,12 @@
void CLFFTConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
const ActivationLayerInfo &act_info)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, act_info);
+}
+
+void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info)
+{
_original_weights = weights;
_original_bias = biases;
@@ -121,7 +127,7 @@
// Permute bias
if(biases != nullptr)
{
- _permute_bias_func.configure(biases, &_permuted_bias, PermutationVector(1U, 2U, 0U));
+ _permute_bias_func.configure(compile_context, biases, &_permuted_bias, PermutationVector(1U, 2U, 0U));
_permuted_bias.info()->set_data_layout(DataLayout::NCHW);
}
@@ -131,11 +137,11 @@
{
_memory_group.manage(&_permuted_input);
// Configure the function to transform the input tensor from NHWC -> NCHW
- _permute_input_func.configure(input, &_permuted_input, PermutationVector(1U, 2U, 0U));
+ _permute_input_func.configure(compile_context, input, &_permuted_input, PermutationVector(1U, 2U, 0U));
_permuted_input.info()->set_data_layout(DataLayout::NCHW);
// Configure the function to transform the weights tensor from HWI -> IHW
- _permute_weights_func.configure(weights, &_permuted_weights, PermutationVector(1U, 2U, 0U));
+ _permute_weights_func.configure(compile_context, weights, &_permuted_weights, PermutationVector(1U, 2U, 0U));
_permuted_weights.info()->set_data_layout(DataLayout::NCHW);
input_to_use = &_permuted_input;
@@ -145,20 +151,20 @@
// Flip weights
_flipped_weights.allocator()->init(weights_to_use->info()->clone()->set_is_resizable(true).reset_padding());
_flip_axis.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::U32));
- _flip_weights_func.configure(weights_to_use, &_flipped_weights, &_flip_axis);
+ _flip_weights_func.configure(compile_context, weights_to_use, &_flipped_weights, &_flip_axis);
// Pad weights
const PaddingList padding_w = { { 0, input_dims.x() + pad_valid.x() - 1 }, { 0, input_dims.y() + pad_valid.y() - 1 } };
- _pad_weights_func.configure(&_flipped_weights, &_padded_weights, padding_w);
+ _pad_weights_func.configure(compile_context, &_flipped_weights, &_padded_weights, padding_w);
// Transform weights
_transform_weights_func = support::cpp14::make_unique<CLFFT2D>();
- _transform_weights_func->configure(&_padded_weights, &_transformed_weights, FFT2DInfo());
+ _transform_weights_func->configure(compile_context, &_padded_weights, &_transformed_weights, FFT2DInfo());
// Pad input
const PaddingList padding_in = { { 0, kernel_size.x() + pad_valid.x() - 1 }, { 0, kernel_size.y() + pad_valid.y() - 1 } };
_memory_group.manage(&_padded_input);
- _pad_input_func.configure(input_to_use, &_padded_input, padding_in);
+ _pad_input_func.configure(compile_context, input_to_use, &_padded_input, padding_in);
if(_needs_permute)
{
_permuted_input.allocator()->allocate();
@@ -166,17 +172,17 @@
// Transform input
_memory_group.manage(&_transformed_input);
- _transform_input_func.configure(&_padded_input, &_transformed_input, FFT2DInfo());
+ _transform_input_func.configure(compile_context, &_padded_input, &_transformed_input, FFT2DInfo());
_padded_input.allocator()->allocate();
// Perform product
_memory_group.manage(&_output_product);
- _prod_func.configure(&_transformed_input, &_transformed_weights, &_output_product);
+ _prod_func.configure(compile_context, &_transformed_input, &_transformed_weights, &_output_product);
_transformed_input.allocator()->allocate();
// Perform reduction
_memory_group.manage(&_output_reduced);
- _reduce_func.configure(&_output_product, &_output_reduced, 2, ReductionOperation::SUM);
+ _reduce_func.configure(compile_context, &_output_product, &_output_reduced, 2, ReductionOperation::SUM);
_output_product.allocator()->allocate();
// Transform output
@@ -184,7 +190,7 @@
FFT2DInfo itranform_info;
itranform_info.direction = FFTDirection::Inverse;
_itransformed_output.allocator()->init(_output_reduced.info()->clone()->set_is_resizable(true).set_num_channels(1).reset_padding());
- _itransform_output_func.configure(&_output_reduced, &_itransformed_output, itranform_info);
+ _itransform_output_func.configure(compile_context, &_output_reduced, &_itransformed_output, itranform_info);
_output_reduced.allocator()->allocate();
// Reshape output
@@ -206,7 +212,7 @@
output_to_use = &_permuted_output;
_memory_group.manage(&_permuted_output);
}
- _extract_output_func.configure(&_reshaped_output, output_to_use, Coordinates(start_left, start_top), Coordinates(end_right, end_botton));
+ _extract_output_func.configure(compile_context, &_reshaped_output, output_to_use, Coordinates(start_left, start_top), Coordinates(end_right, end_botton));
_itransformed_output.allocator()->allocate();
// Add bias
@@ -219,7 +225,7 @@
_memory_group.manage(&_permuted_output);
}
auto_init_if_empty(*output_to_use->info(), *_bias_output.info());
- _bias_add_func.configure(&_bias_output, &_permuted_bias, output_to_use, ConvertPolicy::WRAP);
+ _bias_add_func.configure(compile_context, &_bias_output, &_permuted_bias, output_to_use, ConvertPolicy::WRAP);
_bias_output.allocator()->allocate();
}
@@ -228,7 +234,7 @@
{
// Configure the function to transform the convoluted output to ACL's native ordering format NCHW
_permuted_output.info()->set_data_layout(DataLayout::NCHW);
- _permute_output_func.configure(&_permuted_output, output, PermutationVector(2U, 0U, 1U));
+ _permute_output_func.configure(compile_context, &_permuted_output, output, PermutationVector(2U, 0U, 1U));
// Allocate tensors
_permuted_output.allocator()->allocate();
@@ -238,7 +244,7 @@
_is_activationlayer_enabled = act_info.enabled();
if(_is_activationlayer_enabled)
{
- _activation_layer_func.configure(output, nullptr, act_info);
+ _activation_layer_func.configure(compile_context, output, nullptr, act_info);
}
// Setup flip axis data
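
The configure() chain above is the convolution theorem spelled out as kernels: flip and pad the weights, pad the input, transform both, multiply in the frequency domain, reduce over the channel axis, inverse-transform, then crop and add the bias. A standalone numeric sketch of the underlying identity on 1D signals, using a naive O(N^2) DFT in place of the library's radix kernels:

    #include <cmath>
    #include <complex>
    #include <cstdio>
    #include <vector>

    using cd = std::complex<double>;

    std::vector<cd> dft(const std::vector<cd> &x, bool inverse)
    {
        const double PI = std::acos(-1.0);
        const std::size_t N = x.size();
        const double sign = inverse ? 1.0 : -1.0;
        std::vector<cd> out(N);
        for(std::size_t k = 0; k < N; ++k)
            for(std::size_t n = 0; n < N; ++n)
                out[k] += x[n] * std::polar(1.0, sign * 2.0 * PI * k * n / N);
        if(inverse)
            for(auto &v : out) v /= static_cast<double>(N);
        return out;
    }

    int main()
    {
        // Zero padding plays the role of _pad_input_func / _pad_weights_func.
        std::vector<cd> x = { 1, 2, 3, 0, 0, 0 };
        std::vector<cd> w = { 1, -1, 0, 0, 0, 0 };
        auto X = dft(x, false), W = dft(w, false);
        for(std::size_t i = 0; i < X.size(); ++i) X[i] *= W[i]; // frequency-domain product
        const auto y = dft(X, true);                            // inverse transform
        for(const auto &v : y) std::printf("%.2f ", v.real());  // linear convolution of x and w
        std::printf("\n");
    }

With both operands zero-padded to at least len(x) + len(w) - 1, the circular convolution computed this way coincides with the linear one, which is what the layer's padding steps guarantee.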
diff --git a/src/runtime/CL/functions/CLFastCorners.cpp b/src/runtime/CL/functions/CLFastCorners.cpp
index fe2a18c..f51abf0 100644
--- a/src/runtime/CL/functions/CLFastCorners.cpp
+++ b/src/runtime/CL/functions/CLFastCorners.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -55,6 +55,12 @@
void CLFastCorners::configure(const ICLImage *input, float threshold, bool nonmax_suppression, ICLKeyPointArray *corners,
unsigned int *num_corners, BorderMode border_mode, uint8_t constant_border_value)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, threshold, nonmax_suppression, corners, num_corners, border_mode, constant_border_value);
+}
+
+void CLFastCorners::configure(const CLCompileContext &compile_context, const ICLImage *input, float threshold, bool nonmax_suppression, ICLKeyPointArray *corners,
+ unsigned int *num_corners, BorderMode border_mode, uint8_t constant_border_value)
+{
ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
ARM_COMPUTE_ERROR_ON(BorderMode::UNDEFINED != border_mode);
ARM_COMPUTE_ERROR_ON(nullptr == corners);
@@ -72,19 +78,19 @@
const bool update_number = (nullptr != _num_corners);
_memory_group.manage(&_output);
- _fast_corners_kernel.configure(input, &_output, threshold, nonmax_suppression, border_mode);
+ _fast_corners_kernel.configure(compile_context, input, &_output, threshold, nonmax_suppression, border_mode);
if(!_non_max)
{
- _copy_array_kernel.configure(&_output, update_number, _corners, &_num_buffer);
+ _copy_array_kernel.configure(compile_context, &_output, update_number, _corners, &_num_buffer);
}
else
{
_suppr.allocator()->init(tensor_info);
_memory_group.manage(&_suppr);
- _suppr_func.configure(&_output, &_suppr, border_mode);
- _copy_array_kernel.configure(&_suppr, update_number, _corners, &_num_buffer);
+ _suppr_func.configure(compile_context, &_output, &_suppr, border_mode);
+ _copy_array_kernel.configure(compile_context, &_suppr, update_number, _corners, &_num_buffer);
_suppr.allocator()->allocate();
}
diff --git a/src/runtime/CL/functions/CLFill.cpp b/src/runtime/CL/functions/CLFill.cpp
index 035bb7c..7b96ed1 100644
--- a/src/runtime/CL/functions/CLFill.cpp
+++ b/src/runtime/CL/functions/CLFill.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -30,11 +30,15 @@
namespace arm_compute
{
-
void CLFill::configure(ICLTensor *tensor, PixelValue constant_value)
{
+ configure(CLKernelLibrary::get().get_compile_context(), tensor, constant_value);
+}
+
+void CLFill::configure(const CLCompileContext &compile_context, ICLTensor *tensor, PixelValue constant_value)
+{
auto k = arm_compute::support::cpp14::make_unique<CLMemsetKernel>();
- k->configure(tensor, constant_value);
+ k->configure(compile_context, tensor, constant_value);
_kernel = std::move(k);
}
} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLFillBorder.cpp b/src/runtime/CL/functions/CLFillBorder.cpp
index 54c096e..f9d7396 100644
--- a/src/runtime/CL/functions/CLFillBorder.cpp
+++ b/src/runtime/CL/functions/CLFillBorder.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLFillBorder.h"
#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
#include <utility>
@@ -32,7 +32,12 @@
void CLFillBorder::configure(ICLTensor *tensor, unsigned int border_width, BorderMode border_mode, const PixelValue &constant_border_value)
{
+ configure(CLKernelLibrary::get().get_compile_context(), tensor, border_width, border_mode, constant_border_value);
+}
+
+void CLFillBorder::configure(const CLCompileContext &compile_context, ICLTensor *tensor, unsigned int border_width, BorderMode border_mode, const PixelValue &constant_border_value)
+{
auto k = arm_compute::support::cpp14::make_unique<CLFillBorderKernel>();
- k->configure(tensor, BorderSize(border_width), border_mode, constant_border_value);
+ k->configure(compile_context, tensor, BorderSize(border_width), border_mode, constant_border_value);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLFlattenLayer.cpp b/src/runtime/CL/functions/CLFlattenLayer.cpp
index b372c35..9a247cc 100644
--- a/src/runtime/CL/functions/CLFlattenLayer.cpp
+++ b/src/runtime/CL/functions/CLFlattenLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,14 +25,19 @@
#include "arm_compute/core/CL/kernels/CLFlattenLayerKernel.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
using namespace arm_compute;
void CLFlattenLayer::configure(const ICLTensor *input, ICLTensor *output)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, output);
+}
+
+void CLFlattenLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
+{
auto k = arm_compute::support::cpp14::make_unique<CLFlattenLayerKernel>();
- k->configure(input, output);
+ k->configure(compile_context, input, output);
_kernel = std::move(k);
CLScheduler::get().tune_kernel_static(*_kernel);
}
@@ -40,4 +45,4 @@
Status CLFlattenLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
{
return CLFlattenLayerKernel::validate(input, output);
-}
\ No newline at end of file
+}
diff --git a/src/runtime/CL/functions/CLFloor.cpp b/src/runtime/CL/functions/CLFloor.cpp
index 4137071..44e1d39 100644
--- a/src/runtime/CL/functions/CLFloor.cpp
+++ b/src/runtime/CL/functions/CLFloor.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,14 +24,19 @@
#include "arm_compute/runtime/CL/functions/CLFloor.h"
#include "arm_compute/core/CL/kernels/CLFloorKernel.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
namespace arm_compute
{
void CLFloor::configure(const ICLTensor *input, ICLTensor *output)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, output);
+}
+
+void CLFloor::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
+{
auto k = arm_compute::support::cpp14::make_unique<CLFloorKernel>();
- k->configure(input, output);
+ k->configure(compile_context, input, output);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
index dcaa126..ecbac6f 100644
--- a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
@@ -29,7 +29,7 @@
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
#include <algorithm>
@@ -41,7 +41,7 @@
namespace
{
Status construct_gemmlowp_output_stage(const ITensorInfo &input, const ITensorInfo &weights, const ITensorInfo &output,
- GEMMLowpOutputStageInfo &gemmlowp_output_stage)
+ GEMMLowpOutputStageInfo &gemmlowp_output_stage, ActivationLayerInfo activation_info)
{
gemmlowp_output_stage.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
gemmlowp_output_stage.gemmlowp_offset = 0;
@@ -53,13 +53,14 @@
// Configure output stage for quantized case
if(is_data_type_quantized_asymmetric(data_type))
{
- const UniformQuantizationInfo iq_info = input.quantization_info().uniform();
- const UniformQuantizationInfo wq_info = weights.quantization_info().uniform();
- const UniformQuantizationInfo oq_info = output.quantization_info().uniform();
+ const QuantizationInfo oq_info = output.quantization_info();
+ const UniformQuantizationInfo iq_unif = input.quantization_info().uniform();
+ const UniformQuantizationInfo wq_unif = weights.quantization_info().uniform();
+ const UniformQuantizationInfo oq_unif = oq_info.uniform();
- const auto output_quant_info = (output.total_size() == 0) ? iq_info : oq_info;
+ const auto output_quant_info = (output.total_size() == 0) ? iq_unif : oq_unif;
- const float multiplier = (iq_info.scale * wq_info.scale) / output_quant_info.scale;
+ const float multiplier = (iq_unif.scale * wq_unif.scale) / output_quant_info.scale;
int output_multiplier = 0;
int output_shift = 0;
ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
@@ -68,6 +69,27 @@
PixelValue type_max{};
std::tie(type_min, type_max) = get_min_max(data_type);
+ if(activation_info.enabled())
+ {
+ switch(activation_info.activation())
+ {
+ case ActivationLayerInfo::ActivationFunction::RELU:
+ type_min = PixelValue(oq_unif.offset);
+ break;
+ case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
+ type_min = PixelValue(oq_unif.offset);
+ type_max = PixelValue(activation_info.a(), data_type, oq_info);
+ break;
+ case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
+ type_min = PixelValue(activation_info.b(), data_type, oq_info);
+ type_max = PixelValue(activation_info.a(), data_type, oq_info);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Activation function not supported.");
+ break;
+ }
+ }
+
// Set the GEMMLowp output stage info
gemmlowp_output_stage.gemmlowp_offset = output_quant_info.offset;
gemmlowp_output_stage.gemmlowp_multiplier = output_multiplier;
@@ -84,7 +106,7 @@
Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const ITensorInfo *bias, const ITensorInfo &output, const FullyConnectedLayerInfo &fc_info)
{
GEMMLowpOutputStageInfo gemmlowp_output_stage;
- ARM_COMPUTE_RETURN_ON_ERROR(construct_gemmlowp_output_stage(input, weights, output, gemmlowp_output_stage));
+ ARM_COMPUTE_RETURN_ON_ERROR(construct_gemmlowp_output_stage(input, weights, output, gemmlowp_output_stage, fc_info.activation_info));
const GEMMInfo &gemm_info = GEMMInfo(false, // is_a_reshaped
false, // is_b_reshaped
@@ -125,8 +147,13 @@
void CLFullyConnectedLayerReshapeWeights::configure(const ICLTensor *input, ICLTensor *output)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, output);
+}
+
+void CLFullyConnectedLayerReshapeWeights::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
+{
auto k = arm_compute::support::cpp14::make_unique<CLTransposeKernel>();
- k->configure(input, output);
+ k->configure(compile_context, input, output);
_kernel = std::move(k);
}
@@ -141,10 +168,11 @@
_are_weights_reshaped(true), _is_fc_after_conv(true), _is_quantized(false), _is_prepared(false), _original_weights(nullptr)
{
}
-void CLFullyConnectedLayer::configure_mm(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const FullyConnectedLayerInfo &fc_info)
+void CLFullyConnectedLayer::configure_mm(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output,
+ const FullyConnectedLayerInfo &fc_info)
{
GEMMLowpOutputStageInfo gemmlowp_output_stage;
- construct_gemmlowp_output_stage(*input->info(), *weights->info(), *output->info(), gemmlowp_output_stage);
+ construct_gemmlowp_output_stage(*input->info(), *weights->info(), *output->info(), gemmlowp_output_stage, fc_info.activation_info);
const GEMMInfo &gemm_info = GEMMInfo(false, // is_a_reshaped
false, // is_b_reshaped
@@ -155,7 +183,7 @@
gemmlowp_output_stage, // gemmlowp_output_stage
fc_info.fp_mixed_precision, // fp_mixed_precision
true, // broadcast_bias
- ActivationLayerInfo()); // activation_info
+ fc_info.activation_info); // activation_info
if(_is_quantized)
{
@@ -168,7 +196,7 @@
weights->info()->set_quantization_info(QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset));
// Configure gemmlowp function
- _mm_gemmlowp.configure(input, weights, bias, output, gemm_info);
+ _mm_gemmlowp.configure(compile_context, input, weights, bias, output, gemm_info);
// Revert back QuantizatioInfo as input and weights could be used in other fully connected layers
input->info()->set_quantization_info(input_quantization_info);
@@ -177,11 +205,12 @@
else
{
// Configure matrix multiply kernel
- _mm_gemm.configure(input, weights, bias, output, 1.f, 1.f, gemm_info);
+ _mm_gemm.configure(compile_context, input, weights, bias, output, 1.f, 1.f, gemm_info);
}
}
-void CLFullyConnectedLayer::configure_conv_fc(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const FullyConnectedLayerInfo &fc_info)
+void CLFullyConnectedLayer::configure_conv_fc(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output,
+ const FullyConnectedLayerInfo &fc_info)
{
ARM_COMPUTE_ERROR_ON((weights->info()->dimension(1) != (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2))));
@@ -193,26 +222,33 @@
// Configure flatten kernel
_memory_group.manage(&_flatten_output);
- _flatten_layer.configure(input, &_flatten_output);
+ _flatten_layer.configure(compile_context, input, &_flatten_output);
// Configure matrix multiply kernel
- configure_mm(&_flatten_output, weights, bias, output, fc_info);
+ configure_mm(compile_context, &_flatten_output, weights, bias, output, fc_info);
// Allocate the output tensor for flatten once all the configure methods have been called
_flatten_output.allocator()->allocate();
}
-void CLFullyConnectedLayer::configure_fc_fc(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const FullyConnectedLayerInfo &fc_info)
+void CLFullyConnectedLayer::configure_fc_fc(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output,
+ const FullyConnectedLayerInfo &fc_info)
{
ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != weights->info()->dimension(1));
// Configure matrix multiply kernel
- configure_mm(input, weights, bias, output, fc_info);
+ configure_mm(compile_context, input, weights, bias, output, fc_info);
}
void CLFullyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
FullyConnectedLayerInfo fc_info)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, fc_info);
+}
+
+void CLFullyConnectedLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
+ FullyConnectedLayerInfo fc_info)
+{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
// Perform validate step
@@ -260,13 +296,13 @@
{
if(_weights_manager && _weights_manager->are_weights_managed(weights))
{
- _reshape_weights_managed_function.configure(weights);
+ _reshape_weights_managed_function.configure(compile_context, weights);
weights_to_use = utils::cast::polymorphic_downcast<ICLTensor *>(_weights_manager->acquire(weights, &_reshape_weights_managed_function));
}
else
{
// Reshape the weights
- _reshape_weights_function.configure(weights, &_reshape_weights_output);
+ _reshape_weights_function.configure(compile_context, weights, &_reshape_weights_output);
weights_to_use = &_reshape_weights_output;
}
}
@@ -276,7 +312,7 @@
{
if(_weights_manager && _weights_manager->are_weights_managed(weights_to_use))
{
- _convert_weights_managed.configure(weights_to_use,
+ _convert_weights_managed.configure(compile_context, weights_to_use,
input->info()->tensor_shape(),
fc_info.weights_trained_layout);
weights_to_use = utils::cast::polymorphic_downcast<ICLTensor *>(_weights_manager->acquire(weights, &_convert_weights_managed));
@@ -284,7 +320,7 @@
else
{
// Convert weights
- _convert_weights.configure(weights_to_use,
+ _convert_weights.configure(compile_context, weights_to_use,
&_converted_weights_output,
input->info()->tensor_shape(),
fc_info.weights_trained_layout);
@@ -297,12 +333,12 @@
if(_is_fc_after_conv)
{
// Fully Connected layer after a Convolution Layer without batches
- configure_conv_fc(input, weights_to_use, biases, output, fc_info);
+ configure_conv_fc(compile_context, input, weights_to_use, biases, output, fc_info);
}
else
{
// Fully Connected layer after a Fully Connected Layer without batches
- configure_fc_fc(input, weights_to_use, biases, output, fc_info);
+ configure_fc_fc(compile_context, input, weights_to_use, biases, output, fc_info);
}
}
@@ -313,6 +349,8 @@
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2);
+ ARM_COMPUTE_RETURN_ERROR_ON(fc_info.activation_info.enabled() && is_data_type_quantized(input->data_type()) && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::RELU
+ && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU);
bool weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
bool is_fc_after_conv = true;
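
The new switch in construct_gemmlowp_output_stage folds the fused activation into the requantization clamp instead of running a separate activation kernel: RELU raises the lower bound to the output zero point, and the bounded variants quantize act_info.a() and b() into the output's quantization space. A standalone sketch of the bound computation for QASYMM8 (quantize_qasymm8 is a local helper written for this example, not the library call):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    struct UniformQuantization { float scale; int32_t offset; };

    // Local helper: real value -> QASYMM8 code under (scale, offset).
    int32_t quantize_qasymm8(float v, const UniformQuantization &q)
    {
        const auto code = static_cast<int32_t>(std::lround(v / q.scale)) + q.offset;
        return std::min<int32_t>(255, std::max<int32_t>(0, code));
    }

    int main()
    {
        const UniformQuantization oq{ 0.05f, 10 }; // output scale and zero point
        int32_t type_min = 0, type_max = 255;      // full QASYMM8 range

        // RELU:            [zero_point, type_max]
        // BOUNDED_RELU:    [zero_point, quantize(a)]
        // LU_BOUNDED_RELU: [quantize(b), quantize(a)]
        const float a = 6.0f;
        type_min = oq.offset;
        type_max = quantize_qasymm8(a, oq);        // BOUNDED_RELU with a() = 6.0f
        std::printf("gemmlowp clamp = [%d, %d]\n", type_min, type_max);
    }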
diff --git a/src/runtime/CL/functions/CLFuseBatchNormalization.cpp b/src/runtime/CL/functions/CLFuseBatchNormalization.cpp
index 72dd27e..6deecdc 100644
--- a/src/runtime/CL/functions/CLFuseBatchNormalization.cpp
+++ b/src/runtime/CL/functions/CLFuseBatchNormalization.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -41,7 +41,15 @@
const ICLTensor *input_bias, const ICLTensor *bn_beta, const ICLTensor *bn_gamma,
float epsilon, FuseBatchNormalizationType fbn_type)
{
- _fuse_bn_kernel.configure(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type);
+ configure(CLKernelLibrary::get().get_compile_context(), input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type);
+}
+
+void CLFuseBatchNormalization::configure(const CLCompileContext &compile_context, const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var,
+ ICLTensor *fused_weights, ICLTensor *fused_bias,
+ const ICLTensor *input_bias, const ICLTensor *bn_beta, const ICLTensor *bn_gamma,
+ float epsilon, FuseBatchNormalizationType fbn_type)
+{
+ _fuse_bn_kernel.configure(compile_context, input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type);
}
Status CLFuseBatchNormalization::validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var,
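
CLFuseBatchNormalization is a thin wrapper around a single kernel, and the patch only adds the compile-context plumbing, but the fusion it performs is worth stating: batch-norm statistics are folded into the preceding layer's per-channel weights and bias. A standalone sketch of the standard folding formulas, stated here for reference rather than taken from the kernel's source:

    #include <cmath>
    #include <cstdio>
    #include <vector>

    // Fold batch-norm statistics into weights and bias for one output channel:
    //   w' = w * gamma / sqrt(var + eps)
    //   b' = (b - mean) * gamma / sqrt(var + eps) + beta
    void fuse_batch_norm(std::vector<float> &weights, float &bias,
                         float mean, float var, float beta, float gamma, float epsilon)
    {
        const float scale = gamma / std::sqrt(var + epsilon);
        for(float &w : weights) w *= scale;
        bias = (bias - mean) * scale + beta;
    }

    int main()
    {
        std::vector<float> w = { 0.5f, -1.0f, 0.25f }; // one output channel's weights
        float b = 0.1f;
        fuse_batch_norm(w, b, /*mean=*/0.2f, /*var=*/1.0f, /*beta=*/0.0f, /*gamma=*/2.0f, 1e-5f);
        std::printf("b' = %.4f\n", b);
    }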
diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp
index fc56a0b..8466024 100644
--- a/src/runtime/CL/functions/CLGEMM.cpp
+++ b/src/runtime/CL/functions/CLGEMM.cpp
@@ -39,6 +39,7 @@
#include "arm_compute/core/utils/misc/Cast.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/gemm/CLGEMMKernelSelection.h"
#include "arm_compute/runtime/ITensorAllocator.h"
namespace arm_compute
@@ -61,79 +62,27 @@
_original_b(nullptr),
_reshape_b_only_on_first_run(false),
_is_prepared(false),
- _gemm_type(GEMMType::NATIVE)
+ _gemm_kernel_type(CLGEMMKernelType::NATIVE_V1)
{
}
-CLGEMM::GEMMType CLGEMM::select_gemm_type(unsigned int m, unsigned int n, unsigned int k, DataType data_type, bool reshape_b_only_on_first_run, GPUTarget gpu_target)
+CLGEMMKernelType CLGEMM::select_gemm_kernel(unsigned int m, unsigned int n, unsigned int k, DataType data_type, bool reshape_b_only_on_first_run)
{
- GEMMType gemm_type = GEMMType::RESHAPED_V1;
+ std::unique_ptr<ICLGEMMKernelSelection> gemm_kernel = CLGEMMKernelSelectionFactory::create(CLScheduler::get().target());
+ ARM_COMPUTE_ERROR_ON_NULLPTR(gemm_kernel.get());
- if(gpu_target_is_in(gpu_target, GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT,
- GPUTarget::G52, GPUTarget::G52LIT, GPUTarget::G71, GPUTarget::G72,
- GPUTarget::G76, GPUTarget::G77))
- {
- if(data_type == DataType::F32)
- {
- if((m > 1) && (n < 16))
- {
- gemm_type = GEMMType::RESHAPED_V1;
- }
- else if(m == 1)
- {
- gemm_type = GEMMType::RESHAPED_ONLY_RHS;
- }
- else
- {
- // COMPMID-852
- if((k > 256) && (m > 4) && reshape_b_only_on_first_run)
- {
- constexpr float alpha = 3.2f;
- constexpr float fact0 = 1.51f;
- constexpr float fact1 = 1.66f;
- constexpr float ops = 12.0f;
- const float scale = k > 1024 ? 1.07f : 1.0f;
- gemm_type = (alpha + ((n * fact0) / ops) < ((fact1 * n * scale) / ops)) ? GEMMType::RESHAPED_V1 : GEMMType::NATIVE;
- }
- else
- {
- gemm_type = GEMMType::NATIVE;
- }
- }
+ CLGEMMKernelSelectionParams params;
+ params.m = m;
+ params.n = n;
+ params.k = k;
+ params.is_rhs_constant = reshape_b_only_on_first_run;
+ params.data_type = data_type;
- const auto workload = static_cast<float>((m * n) / 20.0f);
-
- gemm_type = ((workload > 1600.0f) && (gemm_type == GEMMType::RESHAPED_V1) && (data_type == DataType::F32)) ? GEMMType::RESHAPED_V2 : gemm_type;
- }
- else
- {
- if((m == 1) || (!reshape_b_only_on_first_run))
- {
- if((n > k) && gpu_target_is_in(gpu_target, GPUTarget::G71))
- {
- gemm_type = GEMMType::NATIVE;
- }
- else
- {
- gemm_type = GEMMType::RESHAPED_ONLY_RHS;
- }
- }
- else
- {
- gemm_type = GEMMType::RESHAPED_V2;
- }
- }
- }
- else
- {
- // We reshape the matrices only if we do not have the vector-by-matrix case and we reshape the matrix B only once
- gemm_type = ((m != 1) && reshape_b_only_on_first_run) ? GEMMType::RESHAPED_V1 : GEMMType::NATIVE;
- }
-
- return gemm_type;
+ return gemm_kernel->select_kernel(params);
}
-void CLGEMM::configure_native(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info)
+void CLGEMM::configure_native_v1(const CLCompileContext &compile_context, const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta,
+ const GEMMInfo &gemm_info)
{
const unsigned int m = gemm_info.reinterpret_input_as_3d() ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1);
const unsigned int n = b->info()->dimension(0);
@@ -146,13 +95,14 @@
GEMMReshapeInfo reshape_info(m, n, k, 1, 1, gemm_info.depth_output_gemm3d(), gemm_info.reinterpret_input_as_3d(), gemm_info.broadcast_bias());
// Configure and tune matrix multiply kernel
- _mm_kernel.configure(a, b, c, output, alpha, beta, false, reshape_info, gemm_info.fp_mixed_precision(), gemm_info.activation_info());
+ _mm_kernel.configure(compile_context, a, b, c, output, alpha, beta, false, reshape_info, gemm_info.fp_mixed_precision(), gemm_info.activation_info());
// Tune kernel statically
CLScheduler::get().tune_kernel_static(_mm_kernel);
}
-void CLGEMM::configure_reshaped_v1(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info)
+void CLGEMM::configure_reshaped_v1(const CLCompileContext &compile_context, const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta,
+ const GEMMInfo &gemm_info)
{
bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
const unsigned int m = reinterpret_input_as_3d ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1);
@@ -200,22 +150,22 @@
}
// Configure interleave kernel
- _reshape_lhs_kernel.configure(a, &_tmp_a, lhs_info, reinterpret_input_as_3d);
+ _reshape_lhs_kernel.configure(compile_context, a, &_tmp_a, lhs_info, reinterpret_input_as_3d);
// Configure transpose kernel
ICLTensor *reshaped_rhs = &_tmp_b;
if(_weights_manager && _weights_manager->are_weights_managed(b))
{
- _reshape_rhs_kernel_managed.configure(b, rhs_info);
+ _reshape_rhs_kernel_managed.configure(compile_context, b, rhs_info);
reshaped_rhs = utils::cast::polymorphic_downcast<ICLTensor *>(_weights_manager->acquire(b, &_reshape_rhs_kernel_managed));
}
else
{
- _reshape_rhs_kernel.configure(b, &_tmp_b, rhs_info);
+ _reshape_rhs_kernel.configure(compile_context, b, &_tmp_b, rhs_info);
}
// Configure and tune matrix multiply kernel
- _mm_kernel.configure(&_tmp_a, reshaped_rhs, c, output, alpha, beta, true, reshape_info, gemm_info.fp_mixed_precision(), gemm_info.activation_info());
+ _mm_kernel.configure(compile_context, &_tmp_a, reshaped_rhs, c, output, alpha, beta, true, reshape_info, gemm_info.fp_mixed_precision(), gemm_info.activation_info());
CLScheduler::get().tune_kernel_static(_mm_kernel);
@@ -228,7 +178,8 @@
}
}
-void CLGEMM::configure_reshaped_v2(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info)
+void CLGEMM::configure_reshaped_v2(const CLCompileContext &compile_context, const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta,
+ const GEMMInfo &gemm_info)
{
DataType data_type = a->info()->data_type();
bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
@@ -275,21 +226,21 @@
// Configure lhs_info and rhs_info
std::tie(lhs_info, rhs_info) = gemm_config->configure(m, n, k, batch_size, data_type);
- _reshape_lhs_kernel.configure(a, &_tmp_a, lhs_info, gemm_info.reinterpret_input_as_3d());
+ _reshape_lhs_kernel.configure(compile_context, a, &_tmp_a, lhs_info, gemm_info.reinterpret_input_as_3d());
ICLTensor *reshaped_rhs = &_tmp_b;
if(_weights_manager && _weights_manager->are_weights_managed(b))
{
- _reshape_rhs_kernel_managed.configure(b, rhs_info);
+ _reshape_rhs_kernel_managed.configure(compile_context, b, rhs_info);
reshaped_rhs = utils::cast::polymorphic_downcast<ICLTensor *>(_weights_manager->acquire(b, &_reshape_rhs_kernel_managed));
}
else
{
- _reshape_rhs_kernel.configure(b, &_tmp_b, rhs_info);
+ _reshape_rhs_kernel.configure(compile_context, b, &_tmp_b, rhs_info);
}
// Configure and tune matrix multiply kernel
- _mm_reshaped_kernel.configure(&_tmp_a, reshaped_rhs, c, output, alpha, beta, lhs_info, rhs_info, kernel_info);
+ _mm_reshaped_kernel.configure(compile_context, &_tmp_a, reshaped_rhs, c, output, alpha, beta, lhs_info, rhs_info, kernel_info);
// Allocate intermediate tensors
_tmp_a.allocator()->allocate();
@@ -300,7 +251,8 @@
}
}
-void CLGEMM::configure_reshaped_only_rhs(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info)
+void CLGEMM::configure_reshaped_only_rhs(const CLCompileContext &compile_context, const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta,
+ const GEMMInfo &gemm_info)
{
DataType data_type = a->info()->data_type();
bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
@@ -345,16 +297,16 @@
ICLTensor *reshaped_rhs = &_tmp_b;
if(_weights_manager && _weights_manager->are_weights_managed(b))
{
- _reshape_rhs_kernel_managed.configure(b, rhs_info);
+ _reshape_rhs_kernel_managed.configure(compile_context, b, rhs_info);
reshaped_rhs = utils::cast::polymorphic_downcast<ICLTensor *>(_weights_manager->acquire(b, &_reshape_rhs_kernel_managed));
}
else
{
- _reshape_rhs_kernel.configure(b, &_tmp_b, rhs_info);
+ _reshape_rhs_kernel.configure(compile_context, b, &_tmp_b, rhs_info);
}
// Configure and tune matrix multiply kernel
- _mm_reshaped_only_rhs_kernel.configure(a, reshaped_rhs, c, output, alpha, beta, lhs_info, rhs_info, kernel_info);
+ _mm_reshaped_only_rhs_kernel.configure(compile_context, a, reshaped_rhs, c, output, alpha, beta, lhs_info, rhs_info, kernel_info);
if(!_reshape_b_only_on_first_run && use_mm_b)
{
@@ -362,7 +314,7 @@
}
}
-Status CLGEMM::validate_native(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
+Status CLGEMM::validate_native_v1(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
{
ARM_COMPUTE_UNUSED(alpha);
ARM_COMPUTE_UNUSED(output);
@@ -438,7 +390,7 @@
return Status{};
}
-Status CLGEMM::validate_reshaped_v2(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
+Status CLGEMM::validate_reshaped(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
{
ARM_COMPUTE_UNUSED(alpha);
ARM_COMPUTE_UNUSED(output);
@@ -536,6 +488,11 @@
void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info)
{
+ configure(CLKernelLibrary::get().get_compile_context(), a, b, c, output, alpha, beta, gemm_info);
+}
+
+void CLGEMM::configure(const CLCompileContext &compile_context, const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info)
+{
ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
// Perform validation step
@@ -547,39 +504,38 @@
_original_b = b;
// Get the GPU target
- const GPUTarget gpu_target = CLScheduler::get().target();
bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
const unsigned int m = reinterpret_input_as_3d ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1);
const unsigned int n = b->info()->dimension(0);
const unsigned int k = a->info()->dimension(0);
// Select GEMMType
- _gemm_type = select_gemm_type(m, n, k, a->info()->data_type(), _reshape_b_only_on_first_run, gpu_target);
+ _gemm_kernel_type = select_gemm_kernel(m, n, k, a->info()->data_type(), _reshape_b_only_on_first_run);
const bool fuse_add_c = (!(helpers::float_ops::is_zero(beta)) && c != nullptr);
const ICLTensor *c_to_use = fuse_add_c ? c : nullptr;
- switch(_gemm_type)
+ switch(_gemm_kernel_type)
{
- case GEMMType::NATIVE:
+ case CLGEMMKernelType::NATIVE_V1:
{
- configure_native(a, b, c_to_use, output, alpha, beta, gemm_info);
+ configure_native_v1(compile_context, a, b, c_to_use, output, alpha, beta, gemm_info);
break;
}
- case GEMMType::RESHAPED_V1:
+ case CLGEMMKernelType::RESHAPED_V1:
{
- configure_reshaped_v1(a, b, c_to_use, output, alpha, beta, gemm_info);
+ configure_reshaped_v1(compile_context, a, b, c_to_use, output, alpha, beta, gemm_info);
break;
}
- case GEMMType::RESHAPED_V2:
+ case CLGEMMKernelType::RESHAPED:
{
- configure_reshaped_v2(a, b, c_to_use, output, alpha, beta, gemm_info);
+ configure_reshaped_v2(compile_context, a, b, c_to_use, output, alpha, beta, gemm_info);
break;
}
- case GEMMType::RESHAPED_ONLY_RHS:
+ case CLGEMMKernelType::RESHAPED_ONLY_RHS:
{
- configure_reshaped_only_rhs(a, b, c_to_use, output, alpha, beta, gemm_info);
+ configure_reshaped_only_rhs(compile_context, a, b, c_to_use, output, alpha, beta, gemm_info);
break;
}
default:
@@ -592,37 +548,36 @@
Status CLGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
{
// Get the GPU target
- const GPUTarget gpu_target = CLScheduler::get().target();
bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
const unsigned int n = b->dimension(0);
const unsigned int k = a->dimension(0);
// Select GEMMType
- GEMMType gemm_type = select_gemm_type(m, n, k, a->data_type(), gemm_info.reshape_b_only_on_first_run(), gpu_target);
+ CLGEMMKernelType gemm_kernel_type = select_gemm_kernel(m, n, k, a->data_type(), gemm_info.reshape_b_only_on_first_run());
const bool fuse_add_c = (!(helpers::float_ops::is_zero(beta)) && c != nullptr);
const ITensorInfo *c_to_use = fuse_add_c ? c : nullptr;
- switch(gemm_type)
+ switch(gemm_kernel_type)
{
- case GEMMType::NATIVE:
+ case CLGEMMKernelType::NATIVE_V1:
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_native(a, b, c_to_use, output, alpha, beta, gemm_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_native_v1(a, b, c_to_use, output, alpha, beta, gemm_info));
break;
}
- case GEMMType::RESHAPED_V1:
+ case CLGEMMKernelType::RESHAPED_V1:
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_reshaped_v1(a, b, c_to_use, output, alpha, beta, gemm_info));
break;
}
- case GEMMType::RESHAPED_V2:
+ case CLGEMMKernelType::RESHAPED:
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_reshaped_v2(a, b, c_to_use, output, alpha, beta, gemm_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_reshaped(a, b, c_to_use, output, alpha, beta, gemm_info));
break;
}
- case GEMMType::RESHAPED_ONLY_RHS:
+ case CLGEMMKernelType::RESHAPED_ONLY_RHS:
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_reshaped_only_rhs(a, b, c_to_use, output, alpha, beta, gemm_info));
break;
@@ -643,14 +598,14 @@
MemoryGroupResourceScope scope_mg(_memory_group);
// Run matrix multiply kernel
- switch(_gemm_type)
+ switch(_gemm_kernel_type)
{
- case GEMMType::NATIVE:
+ case CLGEMMKernelType::NATIVE_V1:
{
CLScheduler::get().enqueue(_mm_kernel, true);
break;
}
- case GEMMType::RESHAPED_V1:
+ case CLGEMMKernelType::RESHAPED_V1:
{
// Run interleave kernel
CLScheduler::get().enqueue(_reshape_lhs_kernel, false);
@@ -671,7 +626,7 @@
CLScheduler::get().enqueue(_mm_kernel, true);
break;
}
- case GEMMType::RESHAPED_V2:
+ case CLGEMMKernelType::RESHAPED:
{
// Run interleave kernel
CLScheduler::get().enqueue(_reshape_lhs_kernel, false);
@@ -692,7 +647,7 @@
CLScheduler::get().enqueue(_mm_reshaped_kernel, true);
break;
}
- case GEMMType::RESHAPED_ONLY_RHS:
+ case CLGEMMKernelType::RESHAPED_ONLY_RHS:
{
if(!_reshape_b_only_on_first_run)
{
@@ -721,7 +676,7 @@
{
if(!_is_prepared)
{
- if(_gemm_type != GEMMType::NATIVE && _reshape_b_only_on_first_run)
+ if(_gemm_kernel_type != CLGEMMKernelType::NATIVE_V1 && _reshape_b_only_on_first_run)
{
if(_weights_manager && _weights_manager->are_weights_managed(_original_b))
{
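
The large deleted block shows what the CLGEMM refactor buys: the per-target kernel heuristics move out of CLGEMM into selector objects created by CLGEMMKernelSelectionFactory, and the caller only packs the problem description into a params struct. A standalone sketch of that seam; the enum, factory and toy policy are illustrative, though the fallback rule mirrors the deleted default branch ((m != 1) && reshape_b_only_on_first_run chooses the reshaped kernel):

    #include <memory>

    enum class GemmKernelType { NATIVE_V1, RESHAPED_V1, RESHAPED, RESHAPED_ONLY_RHS };

    struct SelectionParams
    {
        unsigned m{}, n{}, k{};
        bool     is_rhs_constant{};
    };

    struct IKernelSelection
    {
        virtual ~IKernelSelection() = default;
        virtual GemmKernelType select_kernel(const SelectionParams &) const = 0;
    };

    // One concrete policy per GPU family; a factory picks it from the target.
    struct DefaultSelection : IKernelSelection
    {
        GemmKernelType select_kernel(const SelectionParams &p) const override
        {
            return (p.m != 1 && p.is_rhs_constant) ? GemmKernelType::RESHAPED_V1
                                                   : GemmKernelType::NATIVE_V1;
        }
    };

    std::unique_ptr<IKernelSelection> create_selection(/* GPUTarget target */)
    {
        return std::make_unique<DefaultSelection>();
    }

    int main()
    {
        auto sel = create_selection();
        return sel->select_kernel({ 128, 128, 256, true }) == GemmKernelType::RESHAPED_V1 ? 0 : 1;
    }

Isolating the policy behind an interface means new GPU targets get a selector of their own instead of another branch in a monolithic if-chain.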
diff --git a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
index 682812b..1c37993 100644
--- a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
@@ -48,6 +48,11 @@
void CLConvolutionLayerReshapeWeights::configure(const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, unsigned int num_groups)
{
+ configure(CLKernelLibrary::get().get_compile_context(), weights, biases, output, num_groups);
+}
+
+void CLConvolutionLayerReshapeWeights::configure(const CLCompileContext &compile_context, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, unsigned int num_groups)
+{
// Perform validation step
ARM_COMPUTE_ERROR_ON_NULLPTR(weights, output);
ARM_COMPUTE_ERROR_THROW_ON(CLConvolutionLayerReshapeWeights::validate(weights->info(),
@@ -58,7 +63,7 @@
const bool append_biases = (biases != nullptr) && !is_data_type_quantized_asymmetric(weights->info()->data_type());
const ICLTensor *biases_to_use = (append_biases) ? biases : nullptr;
- _weights_reshape_kernel.configure(weights, biases_to_use, output, num_groups);
+ _weights_reshape_kernel.configure(compile_context, weights, biases_to_use, output, num_groups);
output->info()->set_quantization_info(weights->info()->quantization_info());
}
@@ -100,7 +105,8 @@
{
}
-void CLGEMMConvolutionLayer::configure_mm(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const GEMMLowpOutputStageInfo &gemmlowp_output_stage,
+void CLGEMMConvolutionLayer::configure_mm(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
+ const GEMMLowpOutputStageInfo &gemmlowp_output_stage,
int gemm_3d_depth, const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights);
@@ -127,7 +133,7 @@
input->info()->set_quantization_info(QuantizationInfo(input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset));
weights->info()->set_quantization_info(QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset));
- _mm_gemmlowp.configure(input, weights, biases, output, gemm_info);
+ _mm_gemmlowp.configure(compile_context, input, weights, biases, output, gemm_info);
// Revert back QuantizatioInfo as input and weights could be used in other convolution layers
input->info()->set_quantization_info(input_quantization_info);
@@ -136,7 +142,7 @@
else
{
// Configure matrix multiply function
- _mm_gemm.configure(input, weights, biases, output, 1.0f, 1.0f, gemm_info);
+ _mm_gemm.configure(compile_context, input, weights, biases, output, 1.0f, 1.0f, gemm_info);
}
}
@@ -181,6 +187,13 @@
void CLGEMMConvolutionLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, weights_info, dilation, act_info, num_groups);
+}
+
+void CLGEMMConvolutionLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups)
+{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
ARM_COMPUTE_ERROR_THROW_ON(CLGEMMConvolutionLayer::validate(input->info(),
@@ -252,24 +265,24 @@
if(_weights_manager && _weights_manager->are_weights_managed(weights))
{
- _reshape_weights_managed.configure(weights, biases, num_groups);
+ _reshape_weights_managed.configure(compile_context, weights, biases, num_groups);
weights_to_use = utils::cast::polymorphic_downcast<ICLTensor *>(_weights_manager->acquire(weights, &_reshape_weights_managed));
}
else
{
- _reshape_weights.configure(weights, biases, &_weights_reshaped, num_groups);
+ _reshape_weights.configure(compile_context, weights, biases, &_weights_reshaped, num_groups);
}
}
else
{
if(_weights_manager && _weights_manager->are_weights_managed(weights))
{
- _reshape_weights_managed.configure(weights, nullptr, num_groups);
+ _reshape_weights_managed.configure(compile_context, weights, nullptr, num_groups);
weights_to_use = utils::cast::polymorphic_downcast<ICLTensor *>(_weights_manager->acquire(weights, &_reshape_weights_managed));
}
else
{
- _reshape_weights.configure(weights, nullptr, &_weights_reshaped, num_groups);
+ _reshape_weights.configure(compile_context, weights, nullptr, &_weights_reshaped, num_groups);
}
}
@@ -279,7 +292,7 @@
_memory_group.manage(&_im2col_output);
// Configure and tune im2col. im2col output shape is auto-initialized
- _im2col_kernel.configure(input, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, append_bias, dilation, num_groups);
+ _im2col_kernel.configure(compile_context, input, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, append_bias, dilation, num_groups);
// Set quantization info
_im2col_output.info()->set_quantization_info(input->info()->quantization_info());
@@ -333,8 +346,12 @@
gemmlowp_output_stage.gemmlowp_multiplier = gemmlowp_output_stage.gemmlowp_multipliers[0];
gemmlowp_output_stage.gemmlowp_shift = gemmlowp_output_stage.gemmlowp_shifts[0];
- int min_activation = 0;
- int max_activation = 0;
+ PixelValue min_val{};
+ PixelValue max_val{};
+ std::tie(min_val, max_val) = get_min_max(output->info()->data_type());
+
+ auto min_activation = min_val.get<int32_t>();
+ auto max_activation = max_val.get<int32_t>();
const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU,
ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
@@ -363,7 +380,7 @@
// In case of NHWC, we need to run GEMM3D (gemm_3d_depth != 0) in order to avoid reshaping the output matrix
const unsigned int gemm_3d_depth = (data_layout == DataLayout::NHWC) ? conv_h : 0;
- configure_mm(gemm_input_to_use, weights_to_use, biases_to_use, gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth, act_info);
+ configure_mm(compile_context, gemm_input_to_use, weights_to_use, biases_to_use, gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth, act_info);
if(!_skip_im2col)
{
@@ -373,7 +390,7 @@
if(!_skip_col2im)
{
// Configure and tune Col2Im
- _col2im_kernel.configure(gemm_output_to_use, output, Size2D(conv_w, conv_h), num_groups);
+ _col2im_kernel.configure(compile_context, gemm_output_to_use, output, Size2D(conv_w, conv_h), num_groups);
CLScheduler::get().tune_kernel_static(_col2im_kernel);
}
@@ -387,7 +404,7 @@
if(!_fuse_activation)
{
- _activationlayer_function.configure(output, nullptr, act_info);
+ _activationlayer_function.configure(compile_context, output, nullptr, act_info);
}
ARM_COMPUTE_UNUSED(weights_info);
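
The {0, 0} initialisation replaced above left the requantization clamp degenerate unless a supported activation overwrote it; the patch instead seeds min/max from the numeric range of the output data type, so QASYMM8 and QASYMM8_SIGNED both start from their full spans. A standalone sketch of that default, with std::numeric_limits standing in for get_min_max and PixelValue:

    #include <cstdint>
    #include <cstdio>
    #include <limits>

    template <typename T>
    void default_bounds(int32_t &min_activation, int32_t &max_activation)
    {
        min_activation = std::numeric_limits<T>::min(); // 0 for uint8, -128 for int8
        max_activation = std::numeric_limits<T>::max(); // 255 for uint8, 127 for int8
    }

    int main()
    {
        int32_t lo = 0, hi = 0;
        default_bounds<uint8_t>(lo, hi); // QASYMM8
        std::printf("QASYMM8 clamp = [%d, %d]\n", lo, hi);
        default_bounds<int8_t>(lo, hi);  // QASYMM8_SIGNED
        std::printf("QASYMM8_SIGNED clamp = [%d, %d]\n", lo, hi);
    }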
diff --git a/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp
index 14bda11..1dcb341 100644
--- a/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp
@@ -28,7 +28,6 @@
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "utils/TypePrinter.h"
#include <memory>
#include <tuple>
@@ -62,6 +61,33 @@
return { start, end };
}
+Status construct_gemmlowp_output_stage(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, GEMMLowpOutputStageInfo &output_stage_info)
+{
+ const auto data_type = input->data_type();
+
+ if(is_data_type_quantized_asymmetric(data_type))
+ {
+ const UniformQuantizationInfo iq_info = input->quantization_info().uniform();
+ const UniformQuantizationInfo wq_info = weights->quantization_info().uniform();
+ const UniformQuantizationInfo oq_info = output->quantization_info().uniform();
+
+ float multiplier = iq_info.scale * wq_info.scale / oq_info.scale;
+ int output_multiplier(0);
+ int output_shift(0);
+ ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
+
+ output_stage_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
+ output_stage_info.gemmlowp_multiplier = output_multiplier;
+ output_stage_info.gemmlowp_shift = output_shift;
+ output_stage_info.gemmlowp_offset = oq_info.offset;
+ const auto min_max_bound = get_min_max(data_type);
+ output_stage_info.gemmlowp_min_bound = (std::get<0>(min_max_bound)).get<int32_t>();
+ output_stage_info.gemmlowp_max_bound = (std::get<1>(min_max_bound)).get<int32_t>();
+ output_stage_info.output_data_type = data_type;
+ }
+ return Status{};
+}
+
} // namespace
CLGEMMDeconvolutionLayer::CLGEMMDeconvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
@@ -93,7 +119,7 @@
Status CLGEMMDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &deconv_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, DataType::QASYMM8);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights);
@@ -141,10 +167,13 @@
TensorInfo gemm_output_info = reshaped_t_info.clone()->set_tensor_shape(gemm_output_shape).set_is_resizable(true);
GEMMInfo gemm_info(false, false, true, input->dimension(idx_h), true);
+ GEMMLowpOutputStageInfo output_stage_info;
+
if(is_quantized)
{
ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate(&input->clone()->set_tensor_shape(nhwc_input_shape), &reshaped_t_info, nullptr, &gemm_output_info.set_data_type(DataType::S32),
gemm_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(construct_gemmlowp_output_stage(input, weights, output, output_stage_info));
}
else
{
@@ -160,9 +189,8 @@
{
const auto start_end = compute_start_end_slice_coordinates(col2im_output_info, deconv_info, is_nchw);
ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate(&gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info));
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(&col2im_output_info, nullptr,
- &col2im_output_info.clone()->set_is_resizable(true).set_data_type(DataType::QASYMM8)));
- ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&col2im_output_info.clone()->set_is_resizable(true).set_data_type(DataType::QASYMM8), output, start_end.first, start_end.second));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(&col2im_output_info, nullptr, &col2im_output_info.clone()->set_is_resizable(true).set_data_type(input->data_type()), output_stage_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&col2im_output_info.clone()->set_is_resizable(true).set_data_type(input->data_type()), output, start_end.first, start_end.second));
}
else if(padded_input)
{
@@ -173,16 +201,7 @@
else if(is_quantized)
{
ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate(&gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info));
-
- const UniformQuantizationInfo iq_info = input->quantization_info().uniform();
- const UniformQuantizationInfo wq_info = weights->quantization_info().uniform();
- const UniformQuantizationInfo oq_info = output->quantization_info().uniform();
-
- float multiplier = iq_info.scale * wq_info.scale / oq_info.scale;
- int output_multiplier(0);
- int output_shift(0);
- ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(&col2im_output_info, nullptr, output));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(&col2im_output_info, nullptr, output, output_stage_info));
}
else
{
@@ -194,6 +213,12 @@
void CLGEMMDeconvolutionLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, weights, bias, output, deconv_info);
+}
+
+void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output,
+ const PadStrideInfo &deconv_info)
+{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
ARM_COMPUTE_ERROR_THROW_ON(CLGEMMDeconvolutionLayer::validate(input->info(),
weights->info(),
@@ -216,9 +241,9 @@
if(_is_nchw)
{
_memory_group.manage(&_permuted_input);
- _permute_input_to_nhwc.configure(input, &_permuted_input, PermutationVector(2U, 0U, 1U));
+ _permute_input_to_nhwc.configure(compile_context, input, &_permuted_input, PermutationVector(2U, 0U, 1U));
- _permute_weights_to_nhwc.configure(weights, &_permuted_weights, PermutationVector(2U, 0U, 1U));
+ _permute_weights_to_nhwc.configure(compile_context, weights, &_permuted_weights, PermutationVector(2U, 0U, 1U));
input_to_use = &_permuted_input;
weights_to_use = &_permuted_weights;
@@ -230,8 +255,8 @@
1,
input->info()->data_type(), weights->info()->quantization_info()));
- _reshape_weights.configure(weights_to_use, &_reshaped_weights);
- _transpose_weights.configure(&_reshaped_weights, &_reshaped_weights_t);
+ _reshape_weights.configure(compile_context, weights_to_use, &_reshaped_weights);
+ _transpose_weights.configure(compile_context, &_reshaped_weights, &_reshaped_weights_t);
const size_t idx_h = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
GEMMInfo gemm_info(false, false, true, input->info()->dimension(idx_h), true);
@@ -247,14 +272,14 @@
input_to_use->info()->set_quantization_info(QuantizationInfo(iq_info.uniform().scale, -iq_info.uniform().offset));
_reshaped_weights_t.info()->set_quantization_info(QuantizationInfo(wq_info.uniform().scale, -wq_info.uniform().offset));
- _mm_gemmlowp.configure(input_to_use, &_reshaped_weights_t, nullptr, &_gemm_output, gemm_info);
+ _mm_gemmlowp.configure(compile_context, input_to_use, &_reshaped_weights_t, nullptr, &_gemm_output, gemm_info);
input_to_use->info()->set_quantization_info(iq_info);
_reshaped_weights_t.info()->set_quantization_info(wq_info);
}
else
{
- _mm_gemm.configure(input_to_use, &_reshaped_weights_t, nullptr, &_gemm_output, 1.f, 0.0f, gemm_info);
+ _mm_gemm.configure(compile_context, input_to_use, &_reshaped_weights_t, nullptr, &_gemm_output, 1.f, 0.0f, gemm_info);
}
if(_is_nchw)
@@ -292,20 +317,14 @@
}
// Configure a Col2Im call to reshape the output of GEMM
- _deconv_reshape.configure(&_gemm_output, bias, deconv_reshape_output, input->info(), weights->info(), deconv_info);
+ _deconv_reshape.configure(compile_context, &_gemm_output, bias, deconv_reshape_output, input->info(), weights->info(), deconv_info);
_gemm_output.allocator()->allocate();
if(_is_quantized)
{
- const UniformQuantizationInfo iq_info = input->info()->quantization_info().uniform();
- const UniformQuantizationInfo wq_info = weights->info()->quantization_info().uniform();
- const UniformQuantizationInfo oq_info = output->info()->quantization_info().uniform();
-
- float multiplier = iq_info.scale * wq_info.scale / oq_info.scale;
- int output_multiplier(0);
- int output_shift(0);
- quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift);
- _gemmlowp_output_stage.configure(&_gemmlowp_final, nullptr, output_stage_output, output_multiplier, output_shift, oq_info.offset);
+ GEMMLowpOutputStageInfo output_stage_info;
+ construct_gemmlowp_output_stage(input->info(), weights->info(), output->info(), output_stage_info);
+ _gemmlowp_output_stage.configure(compile_context, &_gemmlowp_final, nullptr, output_stage_output, output_stage_info);
_gemmlowp_final.allocator()->allocate();
}
@@ -313,7 +332,7 @@
if(_padded_input)
{
const auto start_end = compute_start_end_slice_coordinates(*deconv_reshape_output->info(), deconv_info, _is_nchw);
- _slice_gemm.configure(&_slice_gemm_input, slice_output, start_end.first, start_end.second);
+ _slice_gemm.configure(compile_context, &_slice_gemm_input, slice_output, start_end.first, start_end.second);
_slice_gemm_input.allocator()->allocate();
}
}
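Note on the quantized deconvolution path: the hand-rolled multiplier/shift computation deleted above now lives behind construct_gemmlowp_output_stage(), whose body sits outside these hunks. A minimal sketch of what such a helper plausibly computes, reusing the exact arithmetic from the deleted lines (the function name and placement here are illustrative, not the library's):

    // Sketch: pack the requantization parameters the old code computed inline
    // into the GEMMLowpOutputStageInfo consumed by CLGEMMLowpOutputStage.
    GEMMLowpOutputStageInfo make_deconv_output_stage_info(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output)
    {
        const UniformQuantizationInfo iq_info = input->quantization_info().uniform();
        const UniformQuantizationInfo wq_info = weights->quantization_info().uniform();
        const UniformQuantizationInfo oq_info = output->quantization_info().uniform();

        int output_multiplier = 0;
        int output_shift      = 0;
        quantization::calculate_quantized_multiplier(iq_info.scale * wq_info.scale / oq_info.scale, &output_multiplier, &output_shift);

        GEMMLowpOutputStageInfo info{};
        info.type                = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
        info.gemmlowp_multiplier = output_multiplier;
        info.gemmlowp_shift      = output_shift;
        info.gemmlowp_offset     = oq_info.offset;
        info.output_data_type    = output->data_type();
        return info;
    }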
diff --git a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
index cdb78c2..84da4a7 100644
--- a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
@@ -28,12 +28,14 @@
#include "arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/gemm/CLGEMMKernelSelection.h"
namespace arm_compute
{
@@ -42,16 +44,33 @@
namespace
{
-inline bool is_gemm_reshaped(bool reshape_b_only_on_first_run, GPUTarget gpu_target)
+inline bool is_gemm_reshaped(unsigned int m, unsigned int n, unsigned int k, DataType data_type, bool reshape_b_only_on_first_run)
{
- return (get_arch_from_target(gpu_target) != GPUTarget::MIDGARD) && (reshape_b_only_on_first_run);
+ std::unique_ptr<ICLGEMMKernelSelection> gemm_kernel = CLGEMMKernelSelectionFactory::create(CLScheduler::get().target());
+ ARM_COMPUTE_ERROR_ON_NULLPTR(gemm_kernel.get());
+
+ CLGEMMKernelSelectionParams params;
+ params.m = m;
+ params.n = n;
+ params.k = k;
+ params.is_rhs_constant = reshape_b_only_on_first_run;
+ params.data_type = data_type;
+
+ switch(gemm_kernel->select_kernel(params))
+ {
+ case CLGEMMKernelType::NATIVE:
+ return false;
+ case CLGEMMKernelType::RESHAPED_ONLY_RHS:
+ return true;
+ default:
+ ARM_COMPUTE_ERROR("Not supported gemmlowp kernel!");
+ }
}
} // namespace
CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(std::move(memory_manager)),
_weights_to_qasymm8(),
- _mm_midgard_kernel(),
_mm_native_kernel(),
_mm_reshaped_only_rhs_kernel(),
_mtx_b_reshape_kernel(),
@@ -72,16 +91,21 @@
_a_offset(0),
_b_offset(0),
_is_gemm_reshaped(true),
- _is_midgard(false),
_reshape_b_only_on_first_run(false),
_is_prepared(false),
- _fuse_output_stage(false),
- _convert_to_qasymm8(false)
+ _run_output_stage(false),
+ _convert_to_qasymm8(false),
+ _run_offset_contribution(false)
{
}
void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, const GEMMInfo &gemm_info)
{
+ configure(CLKernelLibrary::get().get_compile_context(), a, b, c, output, gemm_info);
+}
+
+void CLGEMMLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_context, const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, const GEMMInfo &gemm_info)
+{
ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
ARM_COMPUTE_ERROR_THROW_ON(CLGEMMLowpMatrixMultiplyCore::validate(a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), gemm_info));
@@ -100,7 +124,6 @@
const GPUTarget gpu_target = CLScheduler::get().target();
// Set the target for the kernels
- _mm_midgard_kernel.set_target(gpu_target);
_mm_native_kernel.set_target(gpu_target);
_mm_reshaped_only_rhs_kernel.set_target(gpu_target);
@@ -118,8 +141,7 @@
const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
// Check if we need to reshape the matrix A and matrix B
- _is_gemm_reshaped = is_gemm_reshaped(_reshape_b_only_on_first_run, gpu_target);
- _is_midgard = gpu_target == GPUTarget::MIDGARD;
+ _is_gemm_reshaped = is_gemm_reshaped(m, n, k, a->info()->data_type(), _reshape_b_only_on_first_run);
if(_convert_to_qasymm8)
{
@@ -127,7 +149,7 @@
TensorInfo weights_info(*b->info());
weights_info.set_data_type(DataType::QASYMM8);
_qasymm8_weights.allocator()->init(weights_info);
- _weights_to_qasymm8.configure(b, &_qasymm8_weights, ConvertPolicy::WRAP, 0);
+ _weights_to_qasymm8.configure(compile_context, b, &_qasymm8_weights, ConvertPolicy::WRAP, 0);
}
const ICLTensor *matrix_b = _convert_to_qasymm8 ? &_qasymm8_weights : b;
@@ -141,12 +163,16 @@
}
// Pick up the GEMM configuration
+ // Whether the data type is DataType::QASYMM8 or DataType::QASYMM8_SIGNED doesn't matter here, since it only affects the shape configuration
std::tie(lhs_info, rhs_info) = CLGEMMReshapedOnlyRHSKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);
// Configure reshape RHS kernel
- _mtx_b_reshape_kernel.configure(_convert_to_qasymm8 ? &_qasymm8_weights : b, &_tmp_b, rhs_info);
+ _mtx_b_reshape_kernel.configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_tmp_b, rhs_info);
}
+ // Using default reduction info
+ const GEMMLowpReductionKernelInfo reduction_info {};
+
// Initialize matrix B reduction kernel only if _a_offset is not equal to 0
if(_a_offset != 0)
{
@@ -158,7 +184,7 @@
}
// Configure Matrix B reduction kernel
- _mtx_b_reduction_kernel.configure(_convert_to_qasymm8 ? &_qasymm8_weights : b, &_vector_sum_col);
+ _mtx_b_reduction_kernel.configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_vector_sum_col, reduction_info);
}
// Initialize Matrix A reduction kernel only if _b_offset is not equal to 0
@@ -169,37 +195,22 @@
_memory_group.manage(&_vector_sum_row);
// Configure matrix A reduction kernel
- _mtx_a_reduction_kernel.configure(a, &_vector_sum_row);
+ _mtx_a_reduction_kernel.configure(compile_context, a, &_vector_sum_row, reduction_info);
}
+ GEMMKernelInfo gemm_kernel_info;
+ gemm_kernel_info.m = m;
+ gemm_kernel_info.n = n;
+ gemm_kernel_info.k = k;
+ gemm_kernel_info.depth_output_gemm3d = depth_output_gemm3d;
+ gemm_kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
+ gemm_kernel_info.lhs_info = lhs_info;
+ gemm_kernel_info.rhs_info = rhs_info;
+ gemm_kernel_info.a_offset = _a_offset;
+ gemm_kernel_info.b_offset = _b_offset;
// If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage
if(gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
{
- _fuse_output_stage = true;
-
- _memory_group.manage(&_mm_result_s32);
-
- if(_is_gemm_reshaped)
- {
- // Configure and tune matrix multiply kernel
- _mm_reshaped_only_rhs_kernel.configure(_matrix_a, matrix_b, &_mm_result_s32, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
- }
- else
- {
- if(_is_midgard)
- {
- // Configure matrix multiply kernel
- _mm_midgard_kernel.configure(_matrix_a, matrix_b, &_mm_result_s32, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
- }
- else
- {
- // Pick up the GEMM configuration
- std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);
-
- // Configure matrix multiply kernel
- _mm_native_kernel.configure(_matrix_a, matrix_b, &_mm_result_s32, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
- }
- }
// Configure offset contribution kernel
const size_t num_filters = (gemm_info.gemmlowp_output_stage().is_quantized_per_channel) ? gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size() : 1;
@@ -208,8 +219,39 @@
GEMMLowpOutputStageInfo gemmlowp_output_stage = gemm_info.gemmlowp_output_stage();
gemmlowp_output_stage.output_data_type = _matrix_a->info()->data_type();
- _offset_contribution_output_stage_kernel.configure(&_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, c, output, a->info()->dimension(0),
- _a_offset, _b_offset, gemmlowp_output_stage, &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts);
+
+ gemm_kernel_info.output_stage = gemmlowp_output_stage;
+
+ if(_is_gemm_reshaped && gemmlowp_output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+ {
+ // Configure and tune matrix multiply kernel with fused output stage
+ _mm_reshaped_only_rhs_kernel.configure(compile_context, _matrix_a, matrix_b, output, gemm_kernel_info, _a_offset == 0 ? nullptr : &_vector_sum_col,
+ _b_offset == 0 ? nullptr : &_vector_sum_row, c, &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts);
+ }
+ else
+ {
+ _run_output_stage = true;
+
+ _memory_group.manage(&_mm_result_s32);
+
+ if(_is_gemm_reshaped)
+ {
+ _mm_reshaped_only_rhs_kernel.configure(compile_context, _matrix_a, matrix_b, &_mm_result_s32, gemm_kernel_info);
+ }
+ else
+ {
+ // Pick up the GEMM configuration
+ std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);
+
+ // Configure matrix multiply kernel
+ _mm_native_kernel.configure(compile_context, _matrix_a, matrix_b, &_mm_result_s32, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
+
+ _offset_contribution_output_stage_kernel.configure(compile_context, &_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, c, output,
+ a->info()->dimension(0),
+ _a_offset, _b_offset, gemmlowp_output_stage, &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts);
+ _mm_result_s32.allocator()->allocate();
+ }
+ }
_gemm_output_stage_multipliers.allocator()->allocate();
_gemm_output_stage_shifts.allocator()->allocate();
@@ -220,35 +262,27 @@
std::memcpy(_gemm_output_stage_shifts.ptr_to_element(Coordinates(0)), gemm_info.gemmlowp_output_stage().gemmlowp_shifts.data(), num_filters * sizeof(int32_t));
_gemm_output_stage_multipliers.unmap();
_gemm_output_stage_shifts.unmap();
-
- _mm_result_s32.allocator()->allocate();
}
else
{
+ _run_offset_contribution = true;
if(_is_gemm_reshaped)
{
// Configure and tune matrix multiply kernel
- _mm_reshaped_only_rhs_kernel.configure(_matrix_a, matrix_b, output, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
+ _mm_reshaped_only_rhs_kernel.configure(compile_context, _matrix_a, matrix_b, output, gemm_kernel_info);
}
else
{
- if(_is_midgard)
- {
- // Configure matrix multiply kernel
- _mm_midgard_kernel.configure(_matrix_a, matrix_b, output, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
- }
- else
- {
- // Pick up the GEMM configuration
- std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);
+ // Pick up the GEMM configuration
+ std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);
- // Configure matrix multiply kernel
- _mm_native_kernel.configure(_matrix_a, matrix_b, output, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
- }
+ // Configure matrix multiply kernel
+ _mm_native_kernel.configure(compile_context, _matrix_a, matrix_b, output, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
}
// Configure offset contribution kernel
- _offset_contribution_kernel.configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, c, a->info()->dimension(0), _a_offset, _b_offset);
+ _offset_contribution_kernel.configure(compile_context, output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, c, a->info()->dimension(0), _a_offset,
+ _b_offset);
}
// Allocate tensors
@@ -275,11 +309,9 @@
{
ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
- //DataType::QSYMM8_PER_CHANNEL supported only for weights
- if(b->data_type() != DataType::QSYMM8_PER_CHANNEL)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
- }
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);
+ ARM_COMPUTE_RETURN_ERROR_ON(a->data_type() == DataType::QASYMM8 && b->data_type() == DataType::QASYMM8_SIGNED);
+ ARM_COMPUTE_RETURN_ERROR_ON(a->data_type() == DataType::QASYMM8_SIGNED && b->data_type() == DataType::QASYMM8);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
@@ -301,9 +333,8 @@
const unsigned int k = a->dimension(0);
const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
- const bool is_midgard = gpu_target == GPUTarget::MIDGARD;
- bool reshape_matrix_b = is_gemm_reshaped(gemm_info.reshape_b_only_on_first_run(), CLScheduler::get().target());
+ bool reshape_matrix_b = is_gemm_reshaped(m, n, k, a->data_type(), gemm_info.reshape_b_only_on_first_run());
const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d);
@@ -332,13 +363,14 @@
TensorInfo info_vector_sum_col{};
TensorInfo info_vector_sum_row{};
+ const GEMMLowpReductionKernelInfo reduction_info;
// Validate matrix B reduction kernel only if _a_offset is not equal to 0
if(a_offset != 0)
{
info_vector_sum_col = TensorInfo(compute_reductionA_shape(weights_info), 1, DataType::S32);
// Configure Matrix B reduction kernel
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixBReductionKernel::validate(&weights_info, &info_vector_sum_col));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixBReductionKernel::validate(&weights_info, &info_vector_sum_col, reduction_info));
}
// Validate Matrix A reduction kernel only if _b_offset is not equal to 0
@@ -347,80 +379,88 @@
info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32);
// Configure matrix A reduction kernel
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(a, &info_vector_sum_row));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(a, &info_vector_sum_row, reduction_info));
}
+ GEMMKernelInfo gemm_kernel_info;
+ gemm_kernel_info.m = m;
+ gemm_kernel_info.n = n;
+ gemm_kernel_info.k = k;
+ gemm_kernel_info.depth_output_gemm3d = depth_output_gemm3d;
+ gemm_kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
+ gemm_kernel_info.lhs_info = lhs_info;
+ gemm_kernel_info.rhs_info = rhs_info;
+ gemm_kernel_info.a_offset = a_offset;
+ gemm_kernel_info.b_offset = b_offset;
if(gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
{
- TensorInfo mm_result_s32_info{};
-
- if(reshape_matrix_b)
- {
- // Output tensor auto inizialitation if not yet initialized
- auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, reshape_info)).set_data_type(DataType::S32));
-
- // Validate matrix multiply
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, lhs_info, rhs_info, reshape_info));
- }
- else
- {
- // Output tensor auto inizialitation if not yet initialized
- auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, false, reshape_info)).set_data_type(DataType::S32));
-
- if(is_midgard)
- {
- // Validate matrix multiply
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, reshape_info));
- }
- else
- {
- // Pick up the GEMM configuration
- std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);
-
- // Validate matrix multiply
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyNativeKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, lhs_info, rhs_info, reshape_info));
- }
- }
-
- // Validate offset contribution kernel
const size_t num_filters = (gemm_info.gemmlowp_output_stage().is_quantized_per_channel) ? gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size() : 1;
const TensorInfo gemm_output_stage_multipliers_shifts_info(TensorInfo(TensorShape(num_filters), 1, DataType::S32));
GEMMLowpOutputStageInfo gemmlowp_output_stage = gemm_info.gemmlowp_output_stage();
gemmlowp_output_stage.output_data_type = a->data_type();
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOffsetContributionOutputStageKernel::validate(&mm_result_s32_info,
- a_offset == 0 ? nullptr : &info_vector_sum_col,
- b_offset == 0 ? nullptr : &info_vector_sum_row,
- c,
- output,
- a_offset, b_offset,
- gemmlowp_output_stage,
- &gemm_output_stage_multipliers_shifts_info,
- &gemm_output_stage_multipliers_shifts_info));
+
+ gemm_kernel_info.output_stage = gemmlowp_output_stage;
+ if(reshape_matrix_b && gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::validate(matrix_a_info, matrix_b_info, output, gemm_kernel_info,
+ a_offset == 0 ? nullptr : &info_vector_sum_col,
+ b_offset == 0 ? nullptr : &info_vector_sum_row,
+ c,
+ &gemm_output_stage_multipliers_shifts_info,
+ &gemm_output_stage_multipliers_shifts_info));
+ }
+ else
+ {
+ TensorInfo mm_result_s32_info{};
+
+ if(reshape_matrix_b)
+ {
+ // Output tensor auto-initialization if not yet initialized
+ auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, reshape_info)).set_data_type(DataType::S32));
+
+ // Validate matrix multiply
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, gemm_kernel_info));
+ }
+ else
+ {
+ // Output tensor auto-initialization if not yet initialized
+ auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, false, reshape_info)).set_data_type(DataType::S32));
+
+ // Pick up the GEMM configuration
+ std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);
+
+ // Validate matrix multiply
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyNativeKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, lhs_info, rhs_info, reshape_info));
+ }
+
+ // Validate offset contribution kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOffsetContributionOutputStageKernel::validate(&mm_result_s32_info,
+ a_offset == 0 ? nullptr : &info_vector_sum_col,
+ b_offset == 0 ? nullptr : &info_vector_sum_row,
+ c,
+ output,
+ a_offset, b_offset,
+ gemmlowp_output_stage,
+ &gemm_output_stage_multipliers_shifts_info,
+ &gemm_output_stage_multipliers_shifts_info));
+ }
}
else
{
if(reshape_matrix_b)
{
// Validate matrix multiply
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::validate(matrix_a_info, matrix_b_info, output, lhs_info, rhs_info, reshape_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::validate(matrix_a_info, matrix_b_info, output, gemm_kernel_info));
}
else
{
- if(is_midgard)
- {
- // Validate matrix multiply
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output, reshape_info));
- }
- else
- {
- // Pick up the GEMM configuration
- std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);
+ // Pick up the GEMM configuration
+ std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);
- // Validate matrix multiply
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyNativeKernel::validate(matrix_a_info, matrix_b_info, output, lhs_info, rhs_info, reshape_info));
- }
+ // Validate matrix multiply
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyNativeKernel::validate(matrix_a_info, matrix_b_info, output, lhs_info, rhs_info, reshape_info));
}
if(output->total_size() != 0)
@@ -458,6 +498,12 @@
CLScheduler::get().enqueue(_mtx_b_reduction_kernel, false);
}
+ // Run matrix A reduction kernel only if _b_offset is not equal to 0
+ if(_b_offset != 0)
+ {
+ CLScheduler::get().enqueue(_mtx_a_reduction_kernel, false);
+ }
+
// Run matrix multiply
if(_is_gemm_reshaped)
{
@@ -465,28 +511,14 @@
}
else
{
- if(_is_midgard)
- {
- CLScheduler::get().enqueue(_mm_midgard_kernel, false);
- }
- else
- {
- CLScheduler::get().enqueue(_mm_native_kernel, false);
- }
+ CLScheduler::get().enqueue(_mm_native_kernel, false);
}
-
- // Run matrix A reduction kernel only if _b_offset is not equal to 0
- if(_b_offset != 0)
- {
- CLScheduler::get().enqueue(_mtx_a_reduction_kernel, false);
- }
-
- if(_fuse_output_stage)
+ if(_run_output_stage)
{
// Run offset contribution/output stage kernel
CLScheduler::get().enqueue(_offset_contribution_output_stage_kernel, true);
}
- else
+ if(_run_offset_contribution)
{
// Run offset contribution kernel
CLScheduler::get().enqueue(_offset_contribution_kernel, true);
@@ -524,4 +556,4 @@
_is_prepared = true;
}
}
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
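With the Midgard-specific kernel removed, whether to use the reshaped-only-RHS kernel is now decided per GEMM shape by the CLGEMMKernelSelection heuristic shown above. A caller-side sketch of the new compile-context overload (tensor shapes, quantization parameters, and the default GEMMInfo are assumptions for illustration):

    // Sketch: QASYMM8 x QASYMM8 -> S32; no fused output stage, so the
    // offset-contribution kernel runs after the matrix multiply.
    CLTensor a, b, dst;
    a.allocator()->init(TensorInfo(TensorShape(32U, 16U), 1, DataType::QASYMM8, QuantizationInfo(0.5f, 10)));  // K=32, M=16
    b.allocator()->init(TensorInfo(TensorShape(8U, 32U), 1, DataType::QASYMM8, QuantizationInfo(0.25f, 3)));  // N=8,  K=32
    dst.allocator()->init(TensorInfo(TensorShape(8U, 16U), 1, DataType::S32));

    CLGEMMLowpMatrixMultiplyCore mm;
    mm.configure(CLKernelLibrary::get().get_compile_context(), &a, &b, nullptr, &dst, GEMMInfo());

    a.allocator()->allocate();
    b.allocator()->allocate();
    dst.allocator()->allocate();
    mm.run();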
diff --git a/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp b/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp
index 9551fc7..9ae5d51 100644
--- a/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp
+++ b/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,33 +24,67 @@
#include "arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h"
#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h"
+#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h"
#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h"
#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h"
#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloatKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
namespace arm_compute
{
void CLGEMMLowpQuantizeDownInt32ToUint8Scale::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_offset, int result_mult_int, int result_shift, int min, int max)
{
- auto k = arm_compute::support::cpp14::make_unique<CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel>();
- k->configure(input, bias, output, result_offset, result_mult_int, result_shift, min, max);
+ GEMMLowpOutputStageInfo info = GEMMLowpOutputStageInfo();
+ info.gemmlowp_offset = result_offset;
+ info.gemmlowp_multiplier = result_mult_int;
+ info.gemmlowp_shift = result_shift;
+ info.gemmlowp_min_bound = min;
+ info.gemmlowp_max_bound = max;
+
+ auto k = arm_compute::support::cpp14::make_unique<CLGEMMLowpQuantizeDownInt32ScaleKernel>();
+ k->configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, &info);
+ _kernel = std::move(k);
+}
+
+void CLGEMMLowpQuantizeDownInt32ToUint8Scale::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_offset,
+ int result_mult_int,
+ int result_shift, int min, int max)
+{
+ GEMMLowpOutputStageInfo info = GEMMLowpOutputStageInfo();
+ info.gemmlowp_offset = result_offset;
+ info.gemmlowp_multiplier = result_mult_int;
+ info.gemmlowp_shift = result_shift;
+ info.gemmlowp_min_bound = min;
+ info.gemmlowp_max_bound = max;
+
+ auto k = arm_compute::support::cpp14::make_unique<CLGEMMLowpQuantizeDownInt32ScaleKernel>();
+ k->configure(compile_context, input, bias, output, &info);
_kernel = std::move(k);
}
Status CLGEMMLowpQuantizeDownInt32ToUint8Scale::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max)
{
- return CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::validate(input, bias, output, min, max);
+ GEMMLowpOutputStageInfo info = GEMMLowpOutputStageInfo();
+ info.gemmlowp_min_bound = min;
+ info.gemmlowp_max_bound = max;
+
+ return CLGEMMLowpQuantizeDownInt32ScaleKernel::validate(input, bias, output, &info);
}
void CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output,
int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift,
int min, int max)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max);
+}
+
+void CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output,
+ int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift,
+ int min, int max)
+{
auto k = arm_compute::support::cpp14::make_unique<CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel>();
- k->configure(input, bias, output, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max);
+ k->configure(compile_context, input, bias, output, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max);
_kernel = std::move(k);
}
@@ -65,7 +99,16 @@
int min, int max)
{
auto k = arm_compute::support::cpp14::make_unique<CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel>();
- k->configure(input, bias, output, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max);
+ k->configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max);
+ _kernel = std::move(k);
+}
+
+void CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output,
+ int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift,
+ int min, int max)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel>();
+ k->configure(compile_context, input, bias, output, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max);
_kernel = std::move(k);
}
@@ -79,23 +122,54 @@
float multiplier, int offset,
int min, int max)
{
- auto k = arm_compute::support::cpp14::make_unique<CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloatKernel>();
- k->configure(input, bias, output, multiplier, offset, min, max);
+ GEMMLowpOutputStageInfo info = GEMMLowpOutputStageInfo();
+ info.gemmlowp_offset = offset;
+ info.gemmlowp_real_multiplier = multiplier;
+ info.gemmlowp_min_bound = min;
+ info.gemmlowp_max_bound = max;
+
+ auto k = arm_compute::support::cpp14::make_unique<CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel>();
+ k->configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, &info);
+ _kernel = std::move(k);
+}
+
+void CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloat::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output,
+ float multiplier, int offset,
+ int min, int max)
+{
+ GEMMLowpOutputStageInfo info = GEMMLowpOutputStageInfo();
+ info.gemmlowp_offset = offset;
+ info.gemmlowp_real_multiplier = multiplier;
+ info.gemmlowp_min_bound = min;
+ info.gemmlowp_max_bound = max;
+
+ auto k = arm_compute::support::cpp14::make_unique<CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel>();
+ k->configure(compile_context, input, bias, output, &info);
_kernel = std::move(k);
}
Status CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloat::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output,
int min, int max)
{
- return CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloatKernel::validate(input, bias, output, min, max);
+ GEMMLowpOutputStageInfo info = GEMMLowpOutputStageInfo();
+ info.gemmlowp_min_bound = min;
+ info.gemmlowp_max_bound = max;
+ return CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel::validate(input, bias, output, &info);
}
void CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output,
int result_fixedpoint_multiplier, int result_shift,
int min, int max)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, result_fixedpoint_multiplier, result_shift, min, max);
+}
+
+void CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output,
+ int result_fixedpoint_multiplier, int result_shift,
+ int min, int max)
+{
auto k = arm_compute::support::cpp14::make_unique<CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel>();
- k->configure(input, bias, output, result_fixedpoint_multiplier, result_shift, min, max);
+ k->configure(compile_context, input, bias, output, result_fixedpoint_multiplier, result_shift, min, max);
_kernel = std::move(k);
}
@@ -105,4 +179,93 @@
return CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::validate(input, bias, output, min, max);
}
+void CLGEMMLowpOutputStage::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo &info)
+{
+ configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, info);
+}
+
+void CLGEMMLowpOutputStage::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo &info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ switch(info.type)
+ {
+ case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT:
+ {
+ switch(info.output_data_type)
+ {
+ case DataType::QASYMM8:
+ {
+ auto k = arm_compute::support::cpp14::make_unique<CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel>();
+ k->configure(compile_context, input, bias, output, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_offset, info.gemmlowp_min_bound, info.gemmlowp_max_bound);
+ _kernel = std::move(k);
+ break;
+ }
+ case DataType::QASYMM8_SIGNED:
+ {
+ auto k = arm_compute::support::cpp14::make_unique<CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel>();
+ k->configure(compile_context, input, bias, output, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_offset, info.gemmlowp_min_bound, info.gemmlowp_max_bound);
+ _kernel = std::move(k);
+ break;
+ }
+ case DataType::QSYMM16:
+ {
+ auto k = arm_compute::support::cpp14::make_unique<CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel>();
+ k->configure(compile_context, input, bias, output, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_min_bound, info.gemmlowp_max_bound);
+ _kernel = std::move(k);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Unsupported output data type.");
+ }
+ break;
+ }
+ case GEMMLowpOutputStageType::QUANTIZE_DOWN:
+ {
+ auto k = arm_compute::support::cpp14::make_unique<CLGEMMLowpQuantizeDownInt32ScaleKernel>();
+ k->configure(compile_context, input, bias, output, &info);
+ _kernel = std::move(k);
+ break;
+ }
+ case GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT:
+ {
+ auto k = arm_compute::support::cpp14::make_unique<CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel>();
+ k->configure(compile_context, input, bias, output, &info);
+ _kernel = std::move(k);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Unsupported GEMMLowpOutputStage type.");
+ }
+}
+
+Status CLGEMMLowpOutputStage::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo &info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16);
+
+ switch(info.type)
+ {
+ case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT:
+ {
+ switch(output->data_type())
+ {
+ case DataType::QASYMM8:
+ return CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate(input, bias, output, info.gemmlowp_min_bound, info.gemmlowp_max_bound);
+ case DataType::QASYMM8_SIGNED:
+ return CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::validate(input, bias, output, info.gemmlowp_min_bound, info.gemmlowp_max_bound);
+ case DataType::QSYMM16:
+ return CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::validate(input, bias, output, info.gemmlowp_min_bound, info.gemmlowp_max_bound);
+ default:
+ return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported output data type.");
+ }
+ }
+ case GEMMLowpOutputStageType::QUANTIZE_DOWN:
+ return CLGEMMLowpQuantizeDownInt32ScaleKernel::validate(input, bias, output, &info);
+ case GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT:
+ return CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel::validate(input, bias, output, &info);
+ default:
+ return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported GEMMLowpOutputStage type.");
+ }
+}
} // namespace arm_compute
\ No newline at end of file
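The new CLGEMMLowpOutputStage front-end dispatches on GEMMLowpOutputStageInfo::type, and within QUANTIZE_DOWN_FIXEDPOINT on output_data_type, as the switch above shows. A usage sketch (the multiplier, shift, offset, and bounds below are placeholders, not values derived from real quantization parameters):

    GEMMLowpOutputStageInfo info{};
    info.type                = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
    info.output_data_type    = DataType::QASYMM8;
    info.gemmlowp_multiplier = 1073741824; // placeholder fixed-point multiplier
    info.gemmlowp_shift      = 1;          // placeholder right shift
    info.gemmlowp_offset     = 10;         // placeholder output zero point
    info.gemmlowp_min_bound  = 0;
    info.gemmlowp_max_bound  = 255;

    CLGEMMLowpOutputStage output_stage;
    output_stage.configure(&mm_result_s32, nullptr, &dst_qasymm8, info); // S32 in, QASYMM8 out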
diff --git a/src/runtime/CL/functions/CLGather.cpp b/src/runtime/CL/functions/CLGather.cpp
index 459438e..e2b18e0 100644
--- a/src/runtime/CL/functions/CLGather.cpp
+++ b/src/runtime/CL/functions/CLGather.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,14 +25,19 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/CL/kernels/CLGatherKernel.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
namespace arm_compute
{
void CLGather::configure(const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, indices, output, axis);
+}
+
+void CLGather::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis)
+{
auto k = arm_compute::support::cpp14::make_unique<CLGatherKernel>();
- k->configure(input, indices, output, axis);
+ k->configure(compile_context, input, indices, output, axis);
_kernel = std::move(k);
}
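CLGather keeps its single-kernel structure; only the compile-context forwarding overload is new. For reference, a usage sketch (shapes, data types, and axis are assumptions):

    CLTensor src, indices, dst;
    src.allocator()->init(TensorInfo(TensorShape(5U, 4U), 1, DataType::F32));
    indices.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::U32));
    dst.allocator()->init(TensorInfo(TensorShape(2U, 4U), 1, DataType::F32)); // axis-0 extent replaced by the index count

    CLGather gather;
    gather.configure(&src, &indices, &dst, 0); // gather along dimension 0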
diff --git a/src/runtime/CL/functions/CLGaussian3x3.cpp b/src/runtime/CL/functions/CLGaussian3x3.cpp
index 7ebabd7..47367c4 100644
--- a/src/runtime/CL/functions/CLGaussian3x3.cpp
+++ b/src/runtime/CL/functions/CLGaussian3x3.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,7 +25,7 @@
#include "arm_compute/core/CL/kernels/CLGaussian3x3Kernel.h"
#include "arm_compute/core/PixelValue.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
#include <utility>
@@ -33,8 +33,13 @@
void CLGaussian3x3::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, output, border_mode, constant_border_value);
+}
+
+void CLGaussian3x3::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
+{
auto k = arm_compute::support::cpp14::make_unique<CLGaussian3x3Kernel>();
- k->configure(input, output, border_mode == BorderMode::UNDEFINED);
+ k->configure(compile_context, input, output, border_mode == BorderMode::UNDEFINED);
_kernel = std::move(k);
- _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+ _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
}
diff --git a/src/runtime/CL/functions/CLGaussian5x5.cpp b/src/runtime/CL/functions/CLGaussian5x5.cpp
index ea803e4..6b82cd0 100644
--- a/src/runtime/CL/functions/CLGaussian5x5.cpp
+++ b/src/runtime/CL/functions/CLGaussian5x5.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -42,6 +42,11 @@
void CLGaussian5x5::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, output, border_mode, constant_border_value);
+}
+
+void CLGaussian5x5::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
+{
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
_tmp.allocator()->init(TensorInfo(input->info()->tensor_shape(), 1, DataType::U16));
@@ -50,9 +55,9 @@
_memory_group.manage(&_tmp);
// Configure kernels
- _kernel_hor.configure(input, &_tmp, border_mode == BorderMode::UNDEFINED);
- _kernel_vert.configure(&_tmp, output, border_mode == BorderMode::UNDEFINED);
- _border_handler.configure(input, _kernel_hor.border_size(), border_mode, PixelValue(constant_border_value));
+ _kernel_hor.configure(compile_context, input, &_tmp, border_mode == BorderMode::UNDEFINED);
+ _kernel_vert.configure(compile_context, &_tmp, output, border_mode == BorderMode::UNDEFINED);
+ _border_handler.configure(compile_context, input, _kernel_hor.border_size(), border_mode, PixelValue(constant_border_value));
// Allocate intermediate buffers
_tmp.allocator()->allocate();
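Gaussian 5x5 remains a separable two-pass filter: the horizontal 1x5 pass widens U8 input to a U16 temporary, the vertical 5x1 pass narrows back to U8, and both passes plus the border handler now share one compile context. A usage sketch (image size and border mode assumed):

    CLTensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(640U, 480U), 1, DataType::U8));
    dst.allocator()->init(TensorInfo(TensorShape(640U, 480U), 1, DataType::U8));

    CLGaussian5x5 gauss;
    gauss.configure(&src, &dst, BorderMode::REPLICATE);

    src.allocator()->allocate();
    dst.allocator()->allocate();
    gauss.run();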
diff --git a/src/runtime/CL/functions/CLGaussianPyramid.cpp b/src/runtime/CL/functions/CLGaussianPyramid.cpp
index b671b23..1ac9878 100644
--- a/src/runtime/CL/functions/CLGaussianPyramid.cpp
+++ b/src/runtime/CL/functions/CLGaussianPyramid.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -31,7 +31,6 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
-#include "support/ToolchainSupport.h"
#include "arm_compute/runtime/CL/CLPyramid.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
@@ -58,6 +57,11 @@
void CLGaussianPyramidHalf::configure(ICLTensor *input, CLPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, pyramid, border_mode, constant_border_value);
+}
+
+void CLGaussianPyramidHalf::configure(const CLCompileContext &compile_context, ICLTensor *input, CLPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value)
+{
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
ARM_COMPUTE_ERROR_ON(pyramid == nullptr);
ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() != pyramid->get_pyramid_level(0)->info()->num_dimensions());
@@ -91,16 +95,16 @@
for(size_t i = 0; i < num_levels - 1; ++i)
{
/* Configure horizontal kernel */
- _horizontal_reduction[i].configure(_pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i));
+ _horizontal_reduction[i].configure(compile_context, _pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i));
/* Configure vertical kernel */
- _vertical_reduction[i].configure(_tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1));
+ _vertical_reduction[i].configure(compile_context, _tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1));
/* Configure border */
- _horizontal_border_handler[i].configure(_pyramid->get_pyramid_level(i), _horizontal_reduction[i].border_size(), border_mode, PixelValue(constant_border_value));
+ _horizontal_border_handler[i].configure(compile_context, _pyramid->get_pyramid_level(i), _horizontal_reduction[i].border_size(), border_mode, PixelValue(constant_border_value));
/* Configure border */
- _vertical_border_handler[i].configure(_tmp.get_pyramid_level(i), _vertical_reduction[i].border_size(), border_mode, PixelValue(pixel_value_u16));
+ _vertical_border_handler[i].configure(compile_context, _tmp.get_pyramid_level(i), _vertical_reduction[i].border_size(), border_mode, PixelValue(pixel_value_u16));
}
_tmp.allocate();
}
@@ -138,6 +142,11 @@
void CLGaussianPyramidOrb::configure(ICLTensor *input, CLPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, pyramid, border_mode, constant_border_value);
+}
+
+void CLGaussianPyramidOrb::configure(const CLCompileContext &compile_context, ICLTensor *input, CLPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value)
+{
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
ARM_COMPUTE_ERROR_ON(nullptr == pyramid);
ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() != pyramid->get_pyramid_level(0)->info()->num_dimensions());
@@ -163,10 +172,10 @@
for(size_t i = 0; i < num_levels - 1; ++i)
{
/* Configure gaussian 5x5 */
- _gauss5x5[i].configure(_pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i), border_mode, constant_border_value);
+ _gauss5x5[i].configure(compile_context, _pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i), border_mode, constant_border_value);
/* Configure scale image kernel */
- _scale_nearest[i].configure(_tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1), InterpolationPolicy::NEAREST_NEIGHBOR, border_mode, SamplingPolicy::CENTER);
+ _scale_nearest[i].configure(compile_context, _tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1), InterpolationPolicy::NEAREST_NEIGHBOR, border_mode, SamplingPolicy::CENTER);
}
_tmp.allocate();
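Each pyramid level transition above pairs a Gaussian 5x5 smoothing with a nearest-neighbour rescale. For the half-pyramid variant earlier in this file, successive levels halve both dimensions; a sketch of the level-size arithmetic (integer halving shown; the library's exact rounding policy may differ):

    // Approximate size of level i of a half pyramid built from a WxH level 0.
    unsigned int level_width(unsigned int w0, size_t i)  { return w0 >> i; }
    unsigned int level_height(unsigned int h0, size_t i) { return h0 >> i; }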
diff --git a/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp b/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp
index c9eb8ab..7f037fc 100644
--- a/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp
+++ b/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,7 +25,6 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Types.h"
-#include "support/ToolchainSupport.h"
namespace arm_compute
{
@@ -65,6 +64,13 @@
void CLGenerateProposalsLayer::configure(const ICLTensor *scores, const ICLTensor *deltas, const ICLTensor *anchors, ICLTensor *proposals, ICLTensor *scores_out, ICLTensor *num_valid_proposals,
const GenerateProposalsInfo &info)
{
+ configure(CLKernelLibrary::get().get_compile_context(), scores, deltas, anchors, proposals, scores_out, num_valid_proposals, info);
+}
+
+void CLGenerateProposalsLayer::configure(const CLCompileContext &compile_context, const ICLTensor *scores, const ICLTensor *deltas, const ICLTensor *anchors, ICLTensor *proposals,
+ ICLTensor *scores_out,
+ ICLTensor *num_valid_proposals, const GenerateProposalsInfo &info)
+{
ARM_COMPUTE_ERROR_ON_NULLPTR(scores, deltas, anchors, proposals, scores_out, num_valid_proposals);
ARM_COMPUTE_ERROR_THROW_ON(CLGenerateProposalsLayer::validate(scores->info(), deltas->info(), anchors->info(), proposals->info(), scores_out->info(), num_valid_proposals->info(), info));
@@ -85,7 +91,7 @@
// Compute all the anchors
_memory_group.manage(&_all_anchors);
- _compute_anchors_kernel.configure(anchors, &_all_anchors, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale()));
+ _compute_anchors_kernel.configure(compile_context, anchors, &_all_anchors, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale()));
const TensorShape flatten_shape_deltas(values_per_roi, total_num_anchors);
_deltas_flattened.allocator()->init(TensorInfo(flatten_shape_deltas, 1, scores_data_type, deltas->info()->quantization_info()));
@@ -95,13 +101,13 @@
if(!_is_nhwc)
{
_memory_group.manage(&_deltas_permuted);
- _permute_deltas_kernel.configure(deltas, &_deltas_permuted, PermutationVector{ 2, 0, 1 });
- _flatten_deltas_kernel.configure(&_deltas_permuted, &_deltas_flattened);
+ _permute_deltas_kernel.configure(compile_context, deltas, &_deltas_permuted, PermutationVector{ 2, 0, 1 });
+ _flatten_deltas_kernel.configure(compile_context, &_deltas_permuted, &_deltas_flattened);
_deltas_permuted.allocator()->allocate();
}
else
{
- _flatten_deltas_kernel.configure(deltas, &_deltas_flattened);
+ _flatten_deltas_kernel.configure(compile_context, deltas, &_deltas_flattened);
}
const TensorShape flatten_shape_scores(1, total_num_anchors);
@@ -112,13 +118,13 @@
if(!_is_nhwc)
{
_memory_group.manage(&_scores_permuted);
- _permute_scores_kernel.configure(scores, &_scores_permuted, PermutationVector{ 2, 0, 1 });
- _flatten_scores_kernel.configure(&_scores_permuted, &_scores_flattened);
+ _permute_scores_kernel.configure(compile_context, scores, &_scores_permuted, PermutationVector{ 2, 0, 1 });
+ _flatten_scores_kernel.configure(compile_context, &_scores_permuted, &_scores_flattened);
_scores_permuted.allocator()->allocate();
}
else
{
- _flatten_scores_kernel.configure(scores, &_scores_flattened);
+ _flatten_scores_kernel.configure(compile_context, scores, &_scores_flattened);
}
CLTensor *anchors_to_use = &_all_anchors;
@@ -130,18 +136,18 @@
_memory_group.manage(&_all_anchors_f32);
_memory_group.manage(&_deltas_flattened_f32);
// Dequantize anchors to float
- _dequantize_anchors.configure(&_all_anchors, &_all_anchors_f32);
+ _dequantize_anchors.configure(compile_context, &_all_anchors, &_all_anchors_f32);
_all_anchors.allocator()->allocate();
anchors_to_use = &_all_anchors_f32;
// Dequantize deltas to float
- _dequantize_deltas.configure(&_deltas_flattened, &_deltas_flattened_f32);
+ _dequantize_deltas.configure(compile_context, &_deltas_flattened, &_deltas_flattened_f32);
_deltas_flattened.allocator()->allocate();
deltas_to_use = &_deltas_flattened_f32;
}
// Bounding box transform
_memory_group.manage(&_all_proposals);
BoundingBoxTransformInfo bbox_info(info.im_width(), info.im_height(), 1.f);
- _bounding_box_kernel.configure(anchors_to_use, &_all_proposals, deltas_to_use, bbox_info);
+ _bounding_box_kernel.configure(compile_context, anchors_to_use, &_all_proposals, deltas_to_use, bbox_info);
deltas_to_use->allocator()->allocate();
anchors_to_use->allocator()->allocate();
@@ -151,7 +157,7 @@
_memory_group.manage(&_all_proposals_quantized);
// Requantize all_proposals to QASYMM16 with 0.125 scale and 0 offset
_all_proposals_quantized.allocator()->init(TensorInfo(_all_proposals.info()->tensor_shape(), 1, DataType::QASYMM16, QuantizationInfo(0.125f, 0)));
- _quantize_all_proposals.configure(&_all_proposals, &_all_proposals_quantized);
+ _quantize_all_proposals.configure(compile_context, &_all_proposals, &_all_proposals_quantized);
_all_proposals.allocator()->allocate();
_all_proposals_to_use = &_all_proposals_quantized;
}
@@ -186,7 +192,7 @@
_scores_flattened.allocator()->allocate();
// Add the first column that represents the batch id. This will be all zeros, as we don't support multiple images
- _pad_kernel.configure(&_proposals_4_roi_values, proposals, PaddingList{ { 1, 0 } });
+ _pad_kernel.configure(compile_context, &_proposals_4_roi_values, proposals, PaddingList{ { 1, 0 } });
_proposals_4_roi_values.allocator()->allocate();
}
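The quantized proposal path above dequantizes anchors and deltas to F32, runs the bounding-box transform in float, and requantizes the proposals to QASYMM16. A sketch of the requantization target set up in the hunk (the shape variable is illustrative; the diff's flatten shapes imply a fixed number of box values per anchor):

    // Proposals requantized with scale 1/8 and zero offset, as configured above.
    const TensorShape proposals_shape(4U, total_num_anchors); // 4 coordinates per box (assumed layout)
    TensorInfo proposals_q_info(proposals_shape, 1, DataType::QASYMM16, QuantizationInfo(0.125f, 0));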
diff --git a/src/runtime/CL/functions/CLHOGDescriptor.cpp b/src/runtime/CL/functions/CLHOGDescriptor.cpp
index 0931443..0645cfd 100644
--- a/src/runtime/CL/functions/CLHOGDescriptor.cpp
+++ b/src/runtime/CL/functions/CLHOGDescriptor.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -38,6 +38,11 @@
void CLHOGDescriptor::configure(ICLTensor *input, ICLTensor *output, const IHOG *hog, BorderMode border_mode, uint8_t constant_border_value)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, output, hog, border_mode, constant_border_value);
+}
+
+void CLHOGDescriptor::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const IHOG *hog, BorderMode border_mode, uint8_t constant_border_value)
+{
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
ARM_COMPUTE_ERROR_ON(nullptr == output);
ARM_COMPUTE_ERROR_ON(nullptr == hog);
@@ -76,16 +81,16 @@
_memory_group.manage(&_phase);
// Initialise gradient kernel
- _gradient.configure(input, &_mag, &_phase, hog_info->phase_type(), border_mode, constant_border_value);
+ _gradient.configure(compile_context, input, &_mag, &_phase, hog_info->phase_type(), border_mode, constant_border_value);
// Manage intermediate buffers
_memory_group.manage(&_hog_space);
// Initialise orientation binning kernel
- _orient_bin.configure(&_mag, &_phase, &_hog_space, hog->info());
+ _orient_bin.configure(compile_context, &_mag, &_phase, &_hog_space, hog->info());
// Initialize HOG norm kernel
- _block_norm.configure(&_hog_space, output, hog->info());
+ _block_norm.configure(compile_context, &_hog_space, output, hog->info());
// Allocate intermediate tensors
_mag.allocator()->allocate();
diff --git a/src/runtime/CL/functions/CLHOGDetector.cpp b/src/runtime/CL/functions/CLHOGDetector.cpp
index 8eb5e42..bf9bae1 100644
--- a/src/runtime/CL/functions/CLHOGDetector.cpp
+++ b/src/runtime/CL/functions/CLHOGDetector.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -38,13 +38,19 @@
void CLHOGDetector::configure(const ICLTensor *input, const ICLHOG *hog, ICLDetectionWindowArray *detection_windows, const Size2D &detection_window_stride, float threshold, size_t idx_class)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, hog, detection_windows, detection_window_stride, threshold, idx_class);
+}
+
+void CLHOGDetector::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLHOG *hog, ICLDetectionWindowArray *detection_windows, const Size2D &detection_window_stride,
+ float threshold, size_t idx_class)
+{
_detection_windows = detection_windows;
// Allocate buffer for storing the number of detected objects
_num_detection_windows = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(unsigned int));
// Configure HOGDetectorKernel
- _hog_detector_kernel.configure(input, hog, detection_windows, &_num_detection_windows, detection_window_stride, threshold, idx_class);
+ _hog_detector_kernel.configure(compile_context, input, hog, detection_windows, &_num_detection_windows, detection_window_stride, threshold, idx_class);
}
void CLHOGDetector::run()
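The detector keeps its match counter in a host-visible cl::Buffer (CL_MEM_ALLOC_HOST_PTR), configured above. A sketch of how such a counter is typically read back after run() (standard OpenCL C++ bindings; the buffer name here is illustrative):

    unsigned int num_windows = 0;
    CLScheduler::get().queue().enqueueReadBuffer(num_detection_windows_buffer, CL_TRUE /* blocking */, 0,
                                                 sizeof(unsigned int), &num_windows);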
diff --git a/src/runtime/CL/functions/CLHOGGradient.cpp b/src/runtime/CL/functions/CLHOGGradient.cpp
index e509fd8..acf5f2c 100644
--- a/src/runtime/CL/functions/CLHOGGradient.cpp
+++ b/src/runtime/CL/functions/CLHOGGradient.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -36,6 +36,12 @@
void CLHOGGradient::configure(ICLTensor *input, ICLTensor *output_magnitude, ICLTensor *output_phase, PhaseType phase_type, BorderMode border_mode, uint8_t constant_border_value)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, output_magnitude, output_phase, phase_type, border_mode, constant_border_value);
+}
+
+void CLHOGGradient::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output_magnitude, ICLTensor *output_phase, PhaseType phase_type, BorderMode border_mode,
+ uint8_t constant_border_value)
+{
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_magnitude, 1, DataType::S16);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_phase, 1, DataType::U8);
@@ -52,16 +58,16 @@
_memory_group.manage(&_gy);
// Initialise derivative kernel
- _derivative.configure(input, &_gx, &_gy, border_mode, constant_border_value);
+ _derivative.configure(compile_context, input, &_gx, &_gy, border_mode, constant_border_value);
// Initialise magnitude/phase kernel
if(PhaseType::UNSIGNED == phase_type)
{
- _mag_phase.configure(&_gx, &_gy, output_magnitude, output_phase, MagnitudeType::L2NORM, PhaseType::UNSIGNED);
+ _mag_phase.configure(compile_context, &_gx, &_gy, output_magnitude, output_phase, MagnitudeType::L2NORM, PhaseType::UNSIGNED);
}
else
{
- _mag_phase.configure(&_gx, &_gy, output_magnitude, output_phase, MagnitudeType::L2NORM, PhaseType::SIGNED);
+ _mag_phase.configure(compile_context, &_gx, &_gy, output_magnitude, output_phase, MagnitudeType::L2NORM, PhaseType::SIGNED);
}
// Allocate intermediate tensors
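The magnitude/phase kernel above always uses MagnitudeType::L2NORM; only the phase convention varies. For reference, a sketch of the standard definitions (not the kernel source):

    #include <cmath>

    float l2_magnitude(float gx, float gy)
    {
        return std::sqrt(gx * gx + gy * gy);
    }

    float phase_degrees(float gx, float gy, bool unsigned_phase)
    {
        float deg = std::atan2(gy, gx) * 180.0f / 3.14159265f; // (-180, 180]
        if(deg < 0.0f)
        {
            deg += unsigned_phase ? 180.0f : 360.0f; // fold to [0, 180) or wrap to [0, 360)
        }
        return deg;
    }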
diff --git a/src/runtime/CL/functions/CLHOGMultiDetection.cpp b/src/runtime/CL/functions/CLHOGMultiDetection.cpp
index f799d61..248f730 100644
--- a/src/runtime/CL/functions/CLHOGMultiDetection.cpp
+++ b/src/runtime/CL/functions/CLHOGMultiDetection.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -30,7 +30,6 @@
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/Scheduler.h"
-#include "support/ToolchainSupport.h"
using namespace arm_compute;
@@ -56,6 +55,14 @@
void CLHOGMultiDetection::configure(ICLTensor *input, const ICLMultiHOG *multi_hog, ICLDetectionWindowArray *detection_windows, ICLSize2DArray *detection_window_strides, BorderMode border_mode,
uint8_t constant_border_value, float threshold, bool non_maxima_suppression, float min_distance)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, multi_hog, detection_windows, detection_window_strides, border_mode, constant_border_value, threshold, non_maxima_suppression,
+ min_distance);
+}
+
+void CLHOGMultiDetection::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLMultiHOG *multi_hog, ICLDetectionWindowArray *detection_windows,
+ ICLSize2DArray *detection_window_strides, BorderMode border_mode,
+ uint8_t constant_border_value, float threshold, bool non_maxima_suppression, float min_distance)
+{
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
ARM_COMPUTE_ERROR_ON_INVALID_MULTI_HOG(multi_hog);
ARM_COMPUTE_ERROR_ON(nullptr == detection_windows);
@@ -146,7 +153,7 @@
_memory_group.manage(&_phase);
// Initialise gradient kernel
- _gradient_kernel.configure(input, &_mag, &_phase, phase_type, border_mode, constant_border_value);
+ _gradient_kernel.configure(compile_context, input, &_mag, &_phase, phase_type, border_mode, constant_border_value);
// Configure CLTensor for the HOG space and orientation binning kernel
for(size_t i = 0; i < _num_orient_bin_kernel; ++i)
@@ -174,7 +181,7 @@
_memory_group.manage(&_hog_space[i]);
// Initialise orientation binning kernel
- _orient_bin_kernel[i].configure(&_mag, &_phase, &_hog_space[i], multi_hog->model(idx_multi_hog)->info());
+ _orient_bin_kernel[i].configure(compile_context, &_mag, &_phase, &_hog_space[i], multi_hog->model(idx_multi_hog)->info());
}
// Allocate intermediate tensors
@@ -195,7 +202,7 @@
_memory_group.manage(&_hog_norm_space[i]);
// Initialize block normalization kernel
- _block_norm_kernel[i].configure(&_hog_space[idx_orient_bin], &_hog_norm_space[i], multi_hog->model(idx_multi_hog)->info());
+ _block_norm_kernel[i].configure(compile_context, &_hog_space[idx_orient_bin], &_hog_norm_space[i], multi_hog->model(idx_multi_hog)->info());
}
// Allocate intermediate tensors
@@ -211,7 +218,7 @@
{
const size_t idx_block_norm = input_hog_detect[i];
- _hog_detect_kernel[i].configure(&_hog_norm_space[idx_block_norm], multi_hog->cl_model(i), detection_windows, detection_window_strides->at(i), threshold, i);
+ _hog_detect_kernel[i].configure(compile_context, &_hog_norm_space[idx_block_norm], multi_hog->cl_model(i), detection_windows, detection_window_strides->at(i), threshold, i);
}
detection_window_strides->unmap(CLScheduler::get().queue());
diff --git a/src/runtime/CL/functions/CLHarrisCorners.cpp b/src/runtime/CL/functions/CLHarrisCorners.cpp
index 67f550d3..aecec0d 100644
--- a/src/runtime/CL/functions/CLHarrisCorners.cpp
+++ b/src/runtime/CL/functions/CLHarrisCorners.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -35,7 +35,7 @@
#include "arm_compute/runtime/CL/functions/CLSobel7x7.h"
#include "arm_compute/runtime/ITensorAllocator.h"
#include "arm_compute/runtime/Scheduler.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
#include <cmath>
#include <utility>
@@ -65,6 +65,13 @@
float sensitivity, int32_t gradient_size, int32_t block_size, ICLKeyPointArray *corners,
BorderMode border_mode, uint8_t constant_border_value, bool use_fp16)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, threshold, min_dist, sensitivity, gradient_size, block_size, corners, border_mode, constant_border_value, use_fp16);
+}
+
+void CLHarrisCorners::configure(const CLCompileContext &compile_context, ICLImage *input, float threshold, float min_dist,
+ float sensitivity, int32_t gradient_size, int32_t block_size, ICLKeyPointArray *corners,
+ BorderMode border_mode, uint8_t constant_border_value, bool use_fp16)
+{
ARM_COMPUTE_UNUSED(use_fp16); //TODO(COMPMID-772): Add half float support
ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
@@ -96,21 +103,21 @@
case 3:
{
auto k = arm_compute::support::cpp14::make_unique<CLSobel3x3>();
- k->configure(input, &_gx, &_gy, border_mode, constant_border_value);
+ k->configure(compile_context, input, &_gx, &_gy, border_mode, constant_border_value);
_sobel = std::move(k);
break;
}
case 5:
{
auto k = arm_compute::support::cpp14::make_unique<CLSobel5x5>();
- k->configure(input, &_gx, &_gy, border_mode, constant_border_value);
+ k->configure(compile_context, input, &_gx, &_gy, border_mode, constant_border_value);
_sobel = std::move(k);
break;
}
case 7:
{
auto k = arm_compute::support::cpp14::make_unique<CLSobel7x7>();
- k->configure(input, &_gx, &_gy, border_mode, constant_border_value);
+ k->configure(compile_context, input, &_gx, &_gy, border_mode, constant_border_value);
_sobel = std::move(k);
break;
}
@@ -126,11 +133,11 @@
_memory_group.manage(&_score);
// Set/init Harris Score kernel according to block_size
- _harris_score.configure(&_gx, &_gy, &_score, block_size, pow4_normalization_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED);
+ _harris_score.configure(compile_context, &_gx, &_gy, &_score, block_size, pow4_normalization_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED);
// Configure border filling using harris score kernel's block size
- _border_gx.configure(&_gx, _harris_score.border_size(), border_mode, PixelValue(constant_border_value));
- _border_gy.configure(&_gy, _harris_score.border_size(), border_mode, PixelValue(constant_border_value));
+ _border_gx.configure(compile_context, &_gx, _harris_score.border_size(), border_mode, PixelValue(constant_border_value));
+ _border_gy.configure(compile_context, &_gy, _harris_score.border_size(), border_mode, PixelValue(constant_border_value));
// Allocate intermediate buffers
_gx.allocator()->allocate();
@@ -140,7 +147,7 @@
_memory_group.manage(&_nonmax);
// Init non-maxima suppression function
- _non_max_suppr.configure(&_score, &_nonmax, border_mode);
+ _non_max_suppr.configure(compile_context, &_score, &_nonmax, border_mode);
// Allocate intermediate buffers
_score.allocator()->allocate();
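A hedged usage sketch of the new overload on a composite function; parameter values are arbitrary, and it assumes the usual CLTensor/CLArray setup from the library's examples.

```cpp
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/runtime/CL/CLArray.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLHarrisCorners.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    CLImage src; // U8 input, assumed to be filled elsewhere
    src.allocator()->init(TensorInfo(TensorShape(640U, 480U), Format::U8));

    CLKeyPointArray corners(1000); // capacity chosen arbitrarily

    CLHarrisCorners harris;
    // One compile context flows into the Sobel stage, the Harris score
    // kernel, both border handlers and the non-maxima suppression.
    harris.configure(CLKernelLibrary::get().get_compile_context(), &src,
                     20000.f /* threshold */, 5.f /* min_dist */,
                     0.04f /* sensitivity */, 3 /* gradient_size */,
                     3 /* block_size */, &corners,
                     BorderMode::CONSTANT, 0 /* constant_border_value */,
                     false /* use_fp16, unused per the TODO above */);

    src.allocator()->allocate();
    harris.run();
    CLScheduler::get().sync();
    return 0;
}
```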
diff --git a/src/runtime/CL/functions/CLHistogram.cpp b/src/runtime/CL/functions/CLHistogram.cpp
index eb54338..e723024 100644
--- a/src/runtime/CL/functions/CLHistogram.cpp
+++ b/src/runtime/CL/functions/CLHistogram.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -34,8 +34,13 @@
void CLHistogram::configure(const ICLImage *input, ICLDistribution1D *output)
{
- _kernel.configure(input, output);
- _kernel_border.configure(input, output);
+ configure(CLKernelLibrary::get().get_compile_context(), input, output);
+}
+
+void CLHistogram::configure(const CLCompileContext &compile_context, const ICLImage *input, ICLDistribution1D *output)
+{
+ _kernel.configure(compile_context, input, output);
+ _kernel_border.configure(compile_context, input, output);
}
void CLHistogram::run()
diff --git a/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp b/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp
index e639e74..273a873 100644
--- a/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp
+++ b/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp
@@ -34,8 +34,13 @@
void CLInstanceNormalizationLayer::configure(ICLTensor *input, ICLTensor *output, float gamma, float beta, float epsilon, bool use_mixed_precision)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, output, gamma, beta, epsilon, use_mixed_precision);
+}
+
+void CLInstanceNormalizationLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, float gamma, float beta, float epsilon, bool use_mixed_precision)
+{
auto k = arm_compute::support::cpp14::make_unique<CLInstanceNormalizationLayerKernel>();
- k->configure(input, output, InstanceNormalizationLayerKernelInfo(gamma, beta, epsilon, use_mixed_precision));
+ k->configure(compile_context, input, output, InstanceNormalizationLayerKernelInfo(gamma, beta, epsilon, use_mixed_precision));
_kernel = std::move(k);
}
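The function-level signature is unchanged apart from the optional context; gamma/beta/epsilon are now folded into an InstanceNormalizationLayerKernelInfo before reaching the kernel, as the hunk above shows. A hedged usage sketch (shapes and values arbitrary):

```cpp
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLInstanceNormalizationLayer.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    CLTensor src, dst;
    const TensorShape shape(16U, 16U, 8U, 2U); // W, H, C, N
    src.allocator()->init(TensorInfo(shape, 1, DataType::F32));
    dst.allocator()->init(TensorInfo(shape, 1, DataType::F32));

    CLInstanceNormalizationLayer inst_norm;
    inst_norm.configure(CLKernelLibrary::get().get_compile_context(), &src, &dst,
                        1.f /* gamma */, 0.f /* beta */, 1e-12f /* epsilon */,
                        false /* use_mixed_precision */);

    src.allocator()->allocate();
    dst.allocator()->allocate();
    inst_norm.run();
    return 0;
}
```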
diff --git a/src/runtime/CL/functions/CLIntegralImage.cpp b/src/runtime/CL/functions/CLIntegralImage.cpp
index 2d54be3..b3be2f8 100644
--- a/src/runtime/CL/functions/CLIntegralImage.cpp
+++ b/src/runtime/CL/functions/CLIntegralImage.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -35,8 +35,13 @@
void CLIntegralImage::configure(const ICLTensor *input, ICLTensor *output)
{
- _integral_hor.configure(input, output);
- _integral_vert.configure(output);
+ configure(CLKernelLibrary::get().get_compile_context(), input, output);
+}
+
+void CLIntegralImage::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
+{
+ _integral_hor.configure(compile_context, input, output);
+ _integral_vert.configure(compile_context, output);
}
void CLIntegralImage::run()
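The two kernels configured above split the classic integral-image computation into a row pass followed by an in-place column pass. A toy CPU restatement of that split (not library code):

```cpp
#include <cstdint>
#include <vector>

// Row pass writes per-row prefix sums into dst; column pass then accumulates
// each column in place -- the same hor/vert split as the two CL kernels.
// dst must be pre-sized to width * height by the caller.
void integral_image(const std::vector<uint8_t> &src, std::vector<uint32_t> &dst,
                    int width, int height)
{
    for(int y = 0; y < height; ++y) // horizontal pass
    {
        uint32_t row_sum = 0;
        for(int x = 0; x < width; ++x)
        {
            row_sum += src[y * width + x];
            dst[y * width + x] = row_sum;
        }
    }
    for(int y = 1; y < height; ++y) // vertical pass, in place on dst
    {
        for(int x = 0; x < width; ++x)
        {
            dst[y * width + x] += dst[(y - 1) * width + x];
        }
    }
}
```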
diff --git a/src/runtime/CL/functions/CLL2NormalizeLayer.cpp b/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
index 7d1c818..14c83cd 100644
--- a/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
+++ b/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -30,7 +30,6 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "support/ToolchainSupport.h"
namespace arm_compute
{
@@ -46,6 +45,11 @@
void CLL2NormalizeLayer::configure(ICLTensor *input, ICLTensor *output, int axis, float epsilon)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, output, axis, epsilon);
+}
+
+void CLL2NormalizeLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, int axis, float epsilon)
+{
// Reset auxiliary tensor
_sumsq.allocator()->init(TensorInfo());
@@ -54,8 +58,8 @@
// Configure kernels
const uint32_t actual_axis = wrap_around(axis, max_input_tensor_dim);
- _reduce_func.configure(input, &_sumsq, actual_axis, ReductionOperation::SUM_SQUARE);
- _normalize_kernel.configure(input, &_sumsq, output, axis, epsilon);
+ _reduce_func.configure(compile_context, input, &_sumsq, actual_axis, ReductionOperation::SUM_SQUARE);
+ _normalize_kernel.configure(compile_context, input, &_sumsq, output, axis, epsilon);
// Allocate intermediate tensor
_sumsq.allocator()->allocate();
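actual_axis maps negative axis values onto [0, max_input_tensor_dim). A standalone restatement of that wrap-around arithmetic (the library's own helper lives elsewhere; this is not its verbatim code):

```cpp
#include <cassert>

// Negative axes count from the end, as in NumPy: -1 is the last dimension.
int wrap_axis(int axis, int num_dims)
{
    return axis >= 0 ? axis % num_dims : (axis % num_dims + num_dims) % num_dims;
}

int main()
{
    assert(wrap_axis(0, 3) == 0);
    assert(wrap_axis(-1, 3) == 2);
    assert(wrap_axis(-3, 3) == 0);
    return 0;
}
```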
diff --git a/src/runtime/CL/functions/CLLSTMLayer.cpp b/src/runtime/CL/functions/CLLSTMLayer.cpp
index 793d5ca..56f22e2 100644
--- a/src/runtime/CL/functions/CLLSTMLayer.cpp
+++ b/src/runtime/CL/functions/CLLSTMLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,19 +23,17 @@
*/
#include "arm_compute/runtime/CL/functions/CLLSTMLayer.h"
-#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/InfoHelpers.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-#include <cmath>
-#include <memory>
-#include <tuple>
-
-using namespace arm_compute;
+namespace arm_compute
+{
using namespace arm_compute::misc::shape_calculator;
+using namespace arm_compute::utils::info_helpers;
CLLSTMLayer::CLLSTMLayer(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(std::move(memory_manager)), _fully_connected_input_gate(), _accum_input_gate1(), _subtract_input_gate(), _pixelwise_mul_input_gate(), _activation_input_gate(),
@@ -61,6 +59,19 @@
ICLTensor *scratch_buffer, ICLTensor *output_state_out, ICLTensor *cell_state_out, ICLTensor *output,
const LSTMParams<ICLTensor> &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold, float projection_threshold)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights,
+ recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, output_state_in, cell_state_in, scratch_buffer, output_state_out, cell_state_out, output, lstm_params, activation_info,
+ cell_threshold, projection_threshold);
+}
+
+void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input,
+ const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights,
+ const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights,
+ const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias,
+ const ICLTensor *output_state_in, const ICLTensor *cell_state_in,
+ ICLTensor *scratch_buffer, ICLTensor *output_state_out, ICLTensor *cell_state_out, ICLTensor *output,
+ const LSTMParams<ICLTensor> &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold, float projection_threshold)
+{
ARM_COMPUTE_ERROR_ON_NULLPTR(input,
input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
@@ -71,22 +82,8 @@
_is_layer_norm_lstm = lstm_params.use_layer_norm();
// Set lstm parameters
- LSTMParams<ITensorInfo> lstm_params_info;
- if(lstm_params.has_peephole_opt())
- {
- lstm_params_info.set_peephole_params(lstm_params.cell_to_forget_weights()->info(), lstm_params.cell_to_output_weights()->info());
- }
- if(lstm_params.has_projection())
- {
- lstm_params_info.set_projection_params(lstm_params.projection_weights()->info(),
- lstm_params.projection_bias() != nullptr ? lstm_params.projection_bias()->info() : nullptr);
- }
- if(!lstm_params.has_cifg_opt())
- {
- const ITensorInfo *cell_to_input_weights_info = (lstm_params.has_peephole_opt()) ? lstm_params.cell_to_input_weights()->info() : nullptr;
- lstm_params_info.set_cifg_params(lstm_params.input_to_input_weights()->info(), lstm_params.recurrent_to_input_weights()->info(),
- cell_to_input_weights_info, lstm_params.input_gate_bias()->info());
- }
+ LSTMParams<ITensorInfo> lstm_params_info{};
+ build_lstm_params_tensor_info(lstm_params, &lstm_params_info);
// Validate
ARM_COMPUTE_ERROR_THROW_ON(CLLSTMLayer::validate(input->info(), input_to_forget_weights->info(),
@@ -113,7 +110,7 @@
_forget_gate_out2.allocator()->init(TensorInfo(concat_shape, 1, input->info()->data_type()));
_memory_group.manage(&_forget_gate_out2);
- _concat_inputs_forget_gate.configure(input, output_state_in, &_forget_gate_out2);
+ _concat_inputs_forget_gate.configure(compile_context, input, output_state_in, &_forget_gate_out2);
std::vector<const ICLTensor *> weights_vector;
@@ -122,10 +119,10 @@
const TensorShape weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(weights_vector, 0);
_forget_gate_out6.allocator()->init(TensorInfo(weights_concat_shape, 1, input->info()->data_type()));
- _concat_weights_forget_gate.configure(input_to_forget_weights, recurrent_to_forget_weights, &_forget_gate_out6);
+ _concat_weights_forget_gate.configure(compile_context, input_to_forget_weights, recurrent_to_forget_weights, &_forget_gate_out6);
_memory_group.manage(&_forget_gate_out5);
- _fully_connected_forget_gate.configure(&_forget_gate_out2, &_forget_gate_out6, (_is_layer_norm_lstm) ? nullptr : forget_gate_bias, &_forget_gate_out5);
+ _fully_connected_forget_gate.configure(compile_context, &_forget_gate_out2, &_forget_gate_out6, (_is_layer_norm_lstm) ? nullptr : forget_gate_bias, &_forget_gate_out5);
_memory_group.manage(&_forget_gate_out1);
_memory_group.manage(&_forget_gate_out3);
_forget_gate_out6.allocator()->allocate();
@@ -137,8 +134,8 @@
_run_peephole_opt = true;
_memory_group.manage(&_forget_gate_out4);
- _pixelwise_mul_forget_gate.configure(cell_state_in, lstm_params.cell_to_forget_weights(), &_forget_gate_out4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
- _accum_forget_gate1.configure(&_forget_gate_out5, &_forget_gate_out4, &_forget_gate_out3, ConvertPolicy::SATURATE);
+ _pixelwise_mul_forget_gate.configure(compile_context, cell_state_in, lstm_params.cell_to_forget_weights(), &_forget_gate_out4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
+ _accum_forget_gate1.configure(compile_context, &_forget_gate_out5, &_forget_gate_out4, &_forget_gate_out3, ConvertPolicy::SATURATE);
_forget_gate_out4.allocator()->allocate();
_forget_gate_out5.allocator()->allocate();
forget_gate_out = &_forget_gate_out3;
@@ -153,15 +150,16 @@
_forget_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
_memory_group.manage(&_forget_layer_norm_out1);
_memory_group.manage(&_forget_layer_norm_out2);
- _mean_std_norm_forget_gate.configure(forget_gate_out);
- _pixelwise_mul_forget_gate_coeff.configure(forget_gate_out, lstm_params.forget_layer_norm_weights(), &_forget_layer_norm_out1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
+ _mean_std_norm_forget_gate.configure(compile_context, forget_gate_out);
+ _pixelwise_mul_forget_gate_coeff.configure(compile_context, forget_gate_out, lstm_params.forget_layer_norm_weights(), &_forget_layer_norm_out1, 1, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_NEAREST_EVEN);
// forget_gate_out is going to be reassigned, so allocate the tensor that it was assigned to before
forget_gate_out->allocator()->allocate();
- _accum_forget_gate_bias.configure(ArithmeticOperation::ADD, &_forget_layer_norm_out1, forget_gate_bias, &_forget_layer_norm_out2, ConvertPolicy::SATURATE);
+ _accum_forget_gate_bias.configure(compile_context, ArithmeticOperation::ADD, &_forget_layer_norm_out1, forget_gate_bias, &_forget_layer_norm_out2, ConvertPolicy::SATURATE);
_forget_layer_norm_out1.allocator()->allocate();
forget_gate_out = &_forget_layer_norm_out2;
}
- _activation_forget_gate.configure(forget_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+ _activation_forget_gate.configure(compile_context, forget_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
// Configure block that calculates the input gate
// input_gate = Activation(input * input_to_input_weights + output_state * recurrent_to_input_weights + PixelWiseMul(cell_state, cell_to_input_weights) + input_gate_bias), without CIFG
@@ -174,8 +172,8 @@
{
_memory_group.manage(&_input_gate_out1);
_ones.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
- _ones_memset_kernel.configure(&_ones, PixelValue(1, _ones.info()->data_type()));
- _subtract_input_gate.configure(ArithmeticOperation::SUB, &_ones, forget_gate_out, &_input_gate_out1, ConvertPolicy::SATURATE);
+ _ones_memset_kernel.configure(compile_context, &_ones, PixelValue(1, _ones.info()->data_type()));
+ _subtract_input_gate.configure(compile_context, ArithmeticOperation::SUB, &_ones, forget_gate_out, &_input_gate_out1, ConvertPolicy::SATURATE);
_ones.allocator()->allocate();
_run_cifg_opt = true;
}
@@ -190,20 +188,20 @@
TensorShape lstm_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0);
_input_gate_out2.allocator()->init(TensorInfo(lstm_weights_concat_shape, 1, input->info()->data_type()));
- _concat_weights_input_gate.configure(lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights(), &_input_gate_out2);
+ _concat_weights_input_gate.configure(compile_context, lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights(), &_input_gate_out2);
_memory_group.manage(&_input_gate_out1);
_memory_group.manage(&_input_gate_out3);
- _fully_connected_input_gate.configure(&_forget_gate_out2, &_input_gate_out2, (_is_layer_norm_lstm) ? nullptr : lstm_params.input_gate_bias(), &_input_gate_out3);
+ _fully_connected_input_gate.configure(compile_context, &_forget_gate_out2, &_input_gate_out2, (_is_layer_norm_lstm) ? nullptr : lstm_params.input_gate_bias(), &_input_gate_out3);
_input_gate_out2.allocator()->allocate();
input_gate_out = &_input_gate_out3;
if(_run_peephole_opt)
{
_memory_group.manage(&_input_gate_out4);
- _pixelwise_mul_input_gate.configure(cell_state_in, lstm_params.cell_to_input_weights(), &_input_gate_out4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
- _accum_input_gate1.configure(&_input_gate_out3, &_input_gate_out4, &_input_gate_out1, ConvertPolicy::SATURATE);
+ _pixelwise_mul_input_gate.configure(compile_context, cell_state_in, lstm_params.cell_to_input_weights(), &_input_gate_out4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
+ _accum_input_gate1.configure(compile_context, &_input_gate_out3, &_input_gate_out4, &_input_gate_out1, ConvertPolicy::SATURATE);
_input_gate_out3.allocator()->allocate();
_input_gate_out4.allocator()->allocate();
input_gate_out = &_input_gate_out1;
@@ -219,15 +217,16 @@
_input_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
_memory_group.manage(&_input_layer_norm_out1);
_memory_group.manage(&_input_layer_norm_out2);
- _mean_std_norm_input_gate.configure(input_gate_out);
- _pixelwise_mul_input_gate_coeff.configure(input_gate_out, lstm_params.input_layer_norm_weights(), &_input_layer_norm_out1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
+ _mean_std_norm_input_gate.configure(compile_context, input_gate_out);
+ _pixelwise_mul_input_gate_coeff.configure(compile_context, input_gate_out, lstm_params.input_layer_norm_weights(), &_input_layer_norm_out1, 1, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_NEAREST_EVEN);
// input_gate_out is going to be reassigned, so allocate the tensor that it was assigned to before
input_gate_out->allocator()->allocate();
- _accum_input_gate_bias.configure(ArithmeticOperation::ADD, &_input_layer_norm_out1, lstm_params.input_gate_bias(), &_input_layer_norm_out2, ConvertPolicy::SATURATE);
+ _accum_input_gate_bias.configure(compile_context, ArithmeticOperation::ADD, &_input_layer_norm_out1, lstm_params.input_gate_bias(), &_input_layer_norm_out2, ConvertPolicy::SATURATE);
_input_layer_norm_out1.allocator()->allocate();
input_gate_out = &_input_layer_norm_out2;
}
- _activation_input_gate.configure(input_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+ _activation_input_gate.configure(compile_context, input_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
}
// Configure block that calculates the cell state
@@ -240,14 +239,14 @@
_cell_state_out5.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
_memory_group.manage(&_cell_state_out1);
- _fully_connected_cell_state.configure(input, input_to_cell_weights, (_is_layer_norm_lstm) ? nullptr : cell_bias, &_cell_state_out1);
+ _fully_connected_cell_state.configure(compile_context, input, input_to_cell_weights, (_is_layer_norm_lstm) ? nullptr : cell_bias, &_cell_state_out1);
_memory_group.manage(&_cell_state_out2);
- _transpose_cell_state.configure(recurrent_to_cell_weights, &_cell_state_out2);
+ _transpose_cell_state.configure(compile_context, recurrent_to_cell_weights, &_cell_state_out2);
_memory_group.manage(&_cell_state_out3);
- _gemm_cell_state1.configure(output_state_in, &_cell_state_out2, nullptr, &_cell_state_out3, 1.f, 0.f);
+ _gemm_cell_state1.configure(compile_context, output_state_in, &_cell_state_out2, nullptr, &_cell_state_out3, 1.f, 0.f);
_cell_state_out2.allocator()->allocate();
_memory_group.manage(&_cell_state_out4);
- _accum_cell_state1.configure(ArithmeticOperation::ADD, &_cell_state_out1, &_cell_state_out3, &_cell_state_out4, ConvertPolicy::SATURATE);
+ _accum_cell_state1.configure(compile_context, ArithmeticOperation::ADD, &_cell_state_out1, &_cell_state_out3, &_cell_state_out4, ConvertPolicy::SATURATE);
CLTensor *cell_state_out_ptr = &_cell_state_out4;
if(_is_layer_norm_lstm)
{
@@ -255,27 +254,28 @@
_cell_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
_memory_group.manage(&_cell_layer_norm_out1);
_memory_group.manage(&_cell_layer_norm_out2);
- _mean_std_norm_cell_gate.configure(cell_state_out_ptr);
- _pixelwise_mul_cell_gate_coeff.configure(cell_state_out_ptr, lstm_params.cell_layer_norm_weights(), &_cell_layer_norm_out1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
+ _mean_std_norm_cell_gate.configure(compile_context, cell_state_out_ptr);
+ _pixelwise_mul_cell_gate_coeff.configure(compile_context, cell_state_out_ptr, lstm_params.cell_layer_norm_weights(), &_cell_layer_norm_out1, 1, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_NEAREST_EVEN);
// cell_state_out_ptr is going to be reassigned, so allocate the tensor that it was assigned to before
cell_state_out_ptr->allocator()->allocate();
- _accum_cell_gate_bias.configure(ArithmeticOperation::ADD, &_cell_layer_norm_out1, cell_bias, &_cell_layer_norm_out2, ConvertPolicy::SATURATE);
+ _accum_cell_gate_bias.configure(compile_context, ArithmeticOperation::ADD, &_cell_layer_norm_out1, cell_bias, &_cell_layer_norm_out2, ConvertPolicy::SATURATE);
_cell_layer_norm_out1.allocator()->allocate();
cell_state_out_ptr = &_cell_layer_norm_out2;
}
- _activation_cell_state.configure(cell_state_out_ptr, nullptr, activation_info);
+ _activation_cell_state.configure(compile_context, cell_state_out_ptr, nullptr, activation_info);
_memory_group.manage(&_cell_state_out5);
- _pixelwise_mul_cell_state1.configure(cell_state_out_ptr, input_gate_out, &_cell_state_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
+ _pixelwise_mul_cell_state1.configure(compile_context, cell_state_out_ptr, input_gate_out, &_cell_state_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
cell_state_out_ptr->allocator()->allocate();
- _pixelwise_mul_cell_state2.configure(forget_gate_out, cell_state_in, &_cell_state_out3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
- _accum_cell_state2.configure(ArithmeticOperation::ADD, &_cell_state_out5, &_cell_state_out3, &_cell_state_out1, ConvertPolicy::SATURATE);
+ _pixelwise_mul_cell_state2.configure(compile_context, forget_gate_out, cell_state_in, &_cell_state_out3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
+ _accum_cell_state2.configure(compile_context, ArithmeticOperation::ADD, &_cell_state_out5, &_cell_state_out3, &_cell_state_out1, ConvertPolicy::SATURATE);
_cell_state_out3.allocator()->allocate();
_cell_state_out5.allocator()->allocate();
// Perform clipping
if(cell_threshold != 0.f)
{
_perform_cell_clipping = true;
- _cell_clip.configure(&_cell_state_out1, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -cell_threshold, cell_threshold));
+ _cell_clip.configure(compile_context, &_cell_state_out1, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -cell_threshold, cell_threshold));
}
// Configure block that calculates the output
@@ -290,12 +290,12 @@
TensorShape in_out_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0);
_output2.allocator()->init(TensorInfo(in_out_weights_concat_shape, 1, input->info()->data_type()));
- _concat_weights_output.configure(input_to_output_weights, recurrent_to_output_weights, &_output2);
+ _concat_weights_output.configure(compile_context, input_to_output_weights, recurrent_to_output_weights, &_output2);
_memory_group.manage(&_output1);
_memory_group.manage(&_output4);
- _fully_connected_output.configure(&_forget_gate_out2, &_output2, (_is_layer_norm_lstm) ? nullptr : output_gate_bias, &_output4);
+ _fully_connected_output.configure(compile_context, &_forget_gate_out2, &_output2, (_is_layer_norm_lstm) ? nullptr : output_gate_bias, &_output4);
_output2.allocator()->allocate();
_forget_gate_out2.allocator()->allocate();
@@ -306,8 +306,8 @@
_output3.allocator()->init(TensorInfo(_cell_state_out1.info()->tensor_shape(), 1, input->info()->data_type()));
_memory_group.manage(&_output3);
- _pixelwise_mul_output_state1.configure(&_cell_state_out1, lstm_params.cell_to_output_weights(), &_output3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
- _accum_output1.configure(&_output4, &_output3, &_output1, ConvertPolicy::SATURATE);
+ _pixelwise_mul_output_state1.configure(compile_context, &_cell_state_out1, lstm_params.cell_to_output_weights(), &_output3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
+ _accum_output1.configure(compile_context, &_output4, &_output3, &_output1, ConvertPolicy::SATURATE);
_output4.allocator()->allocate();
output_gate_out = &_output1;
@@ -324,15 +324,16 @@
_output_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
_memory_group.manage(&_output_layer_norm_out1);
_memory_group.manage(&_output_layer_norm_out2);
- _mean_std_norm_output_gate.configure(output_gate_out);
- _pixelwise_mul_output_gate_coeff.configure(output_gate_out, lstm_params.output_layer_norm_weights(), &_output_layer_norm_out1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
+ _mean_std_norm_output_gate.configure(compile_context, output_gate_out);
+ _pixelwise_mul_output_gate_coeff.configure(compile_context, output_gate_out, lstm_params.output_layer_norm_weights(), &_output_layer_norm_out1, 1, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_NEAREST_EVEN);
// output_gate_out is going to be reassigned, so allocate the tensor that it was assigned to before
output_gate_out->allocator()->allocate();
- _accum_output_gate_bias.configure(ArithmeticOperation::ADD, &_output_layer_norm_out1, output_gate_bias, &_output_layer_norm_out2, ConvertPolicy::SATURATE);
+ _accum_output_gate_bias.configure(compile_context, ArithmeticOperation::ADD, &_output_layer_norm_out1, output_gate_bias, &_output_layer_norm_out2, ConvertPolicy::SATURATE);
_output_layer_norm_out1.allocator()->allocate();
output_gate_out = &_output_layer_norm_out2;
}
- _activation_output.configure(output_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+ _activation_output.configure(compile_context, output_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
// Configure block that calculates the output state
/** lstm_res = PixelwiseMul(output, Activation(cell_state))
@@ -348,26 +349,26 @@
_output_state1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
_memory_group.manage(&_cell_state_activation);
- _activation_output_state.configure(&_cell_state_out1, &_cell_state_activation, activation_info);
- _pixelwise_mul_output_state2.configure(&_cell_state_activation, output_gate_out, output_state_out_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
+ _activation_output_state.configure(compile_context, &_cell_state_out1, &_cell_state_activation, activation_info);
+ _pixelwise_mul_output_state2.configure(compile_context, &_cell_state_activation, output_gate_out, output_state_out_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
_cell_state_activation.allocator()->allocate();
if(lstm_params.has_projection())
{
_has_projection_weights = true;
- _fully_connected_output_state.configure(output_state_out_tmp, lstm_params.projection_weights(), lstm_params.projection_bias(), output_state_out);
+ _fully_connected_output_state.configure(compile_context, output_state_out_tmp, lstm_params.projection_weights(), lstm_params.projection_bias(), output_state_out);
_output_state1.allocator()->allocate();
// Perform clipping
if(projection_threshold != 0.f)
{
_perform_projection_clipping = true;
- _projection_clip.configure(output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold, projection_threshold));
+ _projection_clip.configure(compile_context, output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold, projection_threshold));
}
}
// Copy cell state and output
- _copy_cell_state.configure(&_cell_state_out1, cell_state_out);
- _copy_output.configure(output_state_out, output);
+ _copy_cell_state.configure(compile_context, &_cell_state_out1, cell_state_out);
+ _copy_output.configure(compile_context, output_state_out, output);
// Vector for holding the tensors to store in scratch buffer
std::vector<ICLTensor *> scratch_inputs;
@@ -378,7 +379,7 @@
scratch_inputs.emplace_back(&_cell_state_out1);
scratch_inputs.emplace_back(forget_gate_out);
scratch_inputs.emplace_back(output_gate_out);
- _concat_scratch_buffer.configure(scratch_inputs, scratch_buffer, Window::DimX);
+ _concat_scratch_buffer.configure(compile_context, scratch_inputs, scratch_buffer, Window::DimX);
input_gate_out->allocator()->allocate();
_cell_state_out1.allocator()->allocate();
forget_gate_out->allocator()->allocate();
@@ -443,7 +444,7 @@
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.input_layer_norm_weights());
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_layer_norm_weights()->num_dimensions() > 1);
- ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_layer_norm_weights()->dimension(0) != num_batches);
+ ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_layer_norm_weights()->dimension(0) != num_cells);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, lstm_params.input_layer_norm_weights());
}
@@ -452,9 +453,9 @@
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.forget_layer_norm_weights()->num_dimensions() > 1);
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_layer_norm_weights()->num_dimensions() > 1);
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.output_layer_norm_weights()->num_dimensions() > 1);
- ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.forget_layer_norm_weights()->dimension(0) != num_batches);
- ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_layer_norm_weights()->dimension(0) != num_batches);
- ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.output_layer_norm_weights()->dimension(0) != num_batches);
+ ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.forget_layer_norm_weights()->dimension(0) != num_cells);
+ ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_layer_norm_weights()->dimension(0) != num_cells);
+ ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.output_layer_norm_weights()->dimension(0) != num_cells);
}
// Check peephole optimization
@@ -729,3 +730,4 @@
_is_prepared = true;
}
}
+} // namespace arm_compute
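Two notes on the hunks above. First, the deleted LSTMParams<ITensorInfo> block is now centralized in utils::info_helpers::build_lstm_params_tensor_info; a sketch mirroring the deleted logic follows (the shipped helper's exact text may differ).

```cpp
#include "arm_compute/core/ITensorInfo.h"
#include "arm_compute/runtime/common/LSTMParams.h"

using namespace arm_compute;

// Mirrors the block deleted from CLLSTMLayer::configure above: copy the
// peephole / projection / CIFG tensors' infos into an info-typed params set.
template <typename T>
void build_lstm_params_tensor_info_sketch(const LSTMParams<T>     &lstm_params,
                                          LSTMParams<ITensorInfo> *lstm_params_info)
{
    if(lstm_params.has_peephole_opt())
    {
        lstm_params_info->set_peephole_params(lstm_params.cell_to_forget_weights()->info(),
                                              lstm_params.cell_to_output_weights()->info());
    }
    if(lstm_params.has_projection())
    {
        lstm_params_info->set_projection_params(lstm_params.projection_weights()->info(),
                                                lstm_params.projection_bias() != nullptr ? lstm_params.projection_bias()->info() : nullptr);
    }
    if(!lstm_params.has_cifg_opt())
    {
        const ITensorInfo *cell_to_input_weights_info = lstm_params.has_peephole_opt() ? lstm_params.cell_to_input_weights()->info() : nullptr;
        lstm_params_info->set_cifg_params(lstm_params.input_to_input_weights()->info(), lstm_params.recurrent_to_input_weights()->info(),
                                          cell_to_input_weights_info, lstm_params.input_gate_bias()->info());
    }
}
```

Second, the validation change from num_batches to num_cells in the same file is a bug fix: the layer-norm weight vectors hold one coefficient per cell, so their length must match the cell count regardless of batch size.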
diff --git a/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp b/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp
index e5f1278..c57fcc9 100644
--- a/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp
+++ b/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -62,6 +62,18 @@
ICLTensor *cell_state_in, const ICLTensor *output_state_in,
ICLTensor *cell_state_out, ICLTensor *output_state_out)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_input_weights,
+ recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out,
+ output_state_out);
+}
+
+void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, const ICLTensor *input,
+ const ICLTensor *input_to_input_weights, const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights,
+ const ICLTensor *recurrent_to_input_weights, const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights,
+ const ICLTensor *input_gate_bias, const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias,
+ ICLTensor *cell_state_in, const ICLTensor *output_state_in,
+ ICLTensor *cell_state_out, ICLTensor *output_state_out)
+{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, output_state_out);
@@ -107,18 +119,18 @@
recurrent_weights_vector.emplace_back(recurrent_to_output_weights);
_input_weights.allocator()->init(TensorInfo(TensorShape(input_size, 4 * output_size), 1, DataType::QASYMM8, qweights));
- _concat_input_weights.configure(inputs_weights_vector, &_input_weights, Window::DimY);
+ _concat_input_weights.configure(compile_context, inputs_weights_vector, &_input_weights, Window::DimY);
_recurrent_weights.allocator()->init(TensorInfo(TensorShape(output_size, 4 * output_size), 1, DataType::QASYMM8, qweights));
- _concat_recurrent_weights.configure(recurrent_weights_vector, &_recurrent_weights, Window::DimY);
+ _concat_recurrent_weights.configure(compile_context, recurrent_weights_vector, &_recurrent_weights, Window::DimY);
std::vector<const ICLTensor *> weights_vector;
weights_vector.emplace_back(&_recurrent_weights);
weights_vector.emplace_back(&_input_weights);
_weights.allocator()->init(TensorInfo(TensorShape(output_size + input_size, 4 * output_size), 1, DataType::QASYMM8, qweights));
- _concat_weights.configure(weights_vector, &_weights, Window::DimX);
- _transpose_weights.configure(&_weights, &_weights_transposed);
+ _concat_weights.configure(compile_context, weights_vector, &_weights, Window::DimX);
+ _transpose_weights.configure(compile_context, &_weights, &_weights_transposed);
// Input concatenation
std::vector<const ICLTensor *> input_vector;
@@ -127,7 +139,7 @@
_memory_group.manage(&_input);
_input.allocator()->init(TensorInfo(TensorShape(output_size + input_size, batch_size), 1, DataType::QASYMM8, qasymm));
- _concat_inputs.configure(input_vector, &_input, Window::DimX);
+ _concat_inputs.configure(compile_context, input_vector, &_input, Window::DimX);
// Bias concatenation
std::vector<const ICLTensor *> bias_vector;
@@ -137,7 +149,7 @@
bias_vector.emplace_back(output_gate_bias);
_bias.allocator()->init(TensorInfo(TensorShape(4 * output_size), 1, DataType::S32));
- _concat_bias.configure(bias_vector, &_bias, Window::DimX);
+ _concat_bias.configure(compile_context, bias_vector, &_bias, Window::DimX);
// Invert the offset for gemmlowp
_input.info()->set_quantization_info(QuantizationInfo(qasymm.uniform().scale, -qasymm.uniform().offset));
@@ -146,7 +158,7 @@
// Run gemmlowp
_memory_group.manage(&_output_highp);
_output_highp.allocator()->init(TensorInfo(TensorShape(4 * output_size, batch_size), 1, DataType::S32));
- _gemmlowp.configure(&_input, &_weights_transposed, nullptr, &_output_highp);
+ _gemmlowp.configure(compile_context, &_input, &_weights_transposed, nullptr, &_output_highp);
_input.allocator()->allocate();
// Set the offset back
@@ -162,7 +174,7 @@
quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift);
_memory_group.manage(&_output_lowp);
- _output_stage.configure(&_output_highp, &_bias, &_output_lowp, output_multiplier, output_shift);
+ _output_stage.configure(compile_context, &_output_highp, &_bias, &_output_lowp, output_multiplier, output_shift);
_output_highp.allocator()->allocate();
_bias.allocator()->allocate();
@@ -170,86 +182,86 @@
if(batch_size > 1)
{
_memory_group.manage(&_input_gate_input);
- _slice_input_tensor.configure(&_output_lowp, &_input_gate_input, { 0, 0 }, { output_size, batch_size });
+ _slice_input_tensor.configure(compile_context, &_output_lowp, &_input_gate_input, { 0, 0 }, { output_size, batch_size });
_memory_group.manage(&_forget_gate_input);
- _slice_forget_tensor.configure(&_output_lowp, &_forget_gate_input, { output_size, 0 }, { 2 * output_size, batch_size });
+ _slice_forget_tensor.configure(compile_context, &_output_lowp, &_forget_gate_input, { output_size, 0 }, { 2 * output_size, batch_size });
_memory_group.manage(&_input_modulation_gate_input);
- _slice_cell_tensor.configure(&_output_lowp, &_input_modulation_gate_input, { 2 * output_size, 0 }, { 3 * output_size, batch_size });
+ _slice_cell_tensor.configure(compile_context, &_output_lowp, &_input_modulation_gate_input, { 2 * output_size, 0 }, { 3 * output_size, batch_size });
_memory_group.manage(&_output_gate_input);
- _slice_output_tensor.configure(&_output_lowp, &_output_gate_input, { 3 * output_size, 0 }, { 4 * output_size, batch_size });
+ _slice_output_tensor.configure(compile_context, &_output_lowp, &_output_gate_input, { 3 * output_size, 0 }, { 4 * output_size, batch_size });
_output_lowp.allocator()->allocate();
}
else
{
_memory_group.manage(&_input_gate_input);
- _slice_input_tensor.configure(&_output_lowp, &_input_gate_input, { 0 }, { output_size });
+ _slice_input_tensor.configure(compile_context, &_output_lowp, &_input_gate_input, { 0 }, { output_size });
_memory_group.manage(&_forget_gate_input);
- _slice_forget_tensor.configure(&_output_lowp, &_forget_gate_input, { output_size }, { 2 * output_size });
+ _slice_forget_tensor.configure(compile_context, &_output_lowp, &_forget_gate_input, { output_size }, { 2 * output_size });
_memory_group.manage(&_input_modulation_gate_input);
- _slice_cell_tensor.configure(&_output_lowp, &_input_modulation_gate_input, { 2 * output_size }, { 3 * output_size });
+ _slice_cell_tensor.configure(compile_context, &_output_lowp, &_input_modulation_gate_input, { 2 * output_size }, { 3 * output_size });
_memory_group.manage(&_output_gate_input);
- _slice_output_tensor.configure(&_output_lowp, &_output_gate_input, { 3 * output_size }, { 4 * output_size });
+ _slice_output_tensor.configure(compile_context, &_output_lowp, &_output_gate_input, { 3 * output_size }, { 4 * output_size });
_output_lowp.allocator()->allocate();
}
// Forget gate
_memory_group.manage(&_forget_gate_output);
_forget_gate_output.allocator()->init(TensorInfo(_forget_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
- _sigmoid_forget_gate.configure(&_forget_gate_input, &_forget_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+ _sigmoid_forget_gate.configure(compile_context, &_forget_gate_input, &_forget_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
_forget_gate_input.allocator()->allocate();
// Input gate
_memory_group.manage(&_input_gate_output);
_input_gate_output.allocator()->init(TensorInfo(_input_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
- _sigmoid_input_gate.configure(&_input_gate_input, &_input_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+ _sigmoid_input_gate.configure(compile_context, &_input_gate_input, &_input_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
_input_gate_input.allocator()->allocate();
// Input modulation gate equation
_memory_group.manage(&_input_modulation_gate_output);
_input_modulation_gate_output.allocator()->init(TensorInfo(_input_modulation_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
- _tanh_modulation_gate.configure(&_input_modulation_gate_input, &_input_modulation_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f));
+ _tanh_modulation_gate.configure(compile_context, &_input_modulation_gate_input, &_input_modulation_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f));
_input_modulation_gate_input.allocator()->allocate();
// Output gate
_memory_group.manage(&_output_gate_output);
_output_gate_output.allocator()->init(TensorInfo(_output_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
- _sigmoid_output_gate.configure(&_output_gate_input, &_output_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+ _sigmoid_output_gate.configure(compile_context, &_output_gate_input, &_output_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
_output_gate_input.allocator()->allocate();
// Long term memory
_memory_group.manage(&_cell_state_tmp1);
_cell_state_tmp1.allocator()->init(TensorInfo(_forget_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4));
- _mul_forget_gate_cell_state.configure(&_forget_gate_output, cell_state_in, &_cell_state_tmp1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _mul_forget_gate_cell_state.configure(compile_context, &_forget_gate_output, cell_state_in, &_cell_state_tmp1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
_forget_gate_output.allocator()->allocate();
_memory_group.manage(&_cell_state_tmp2);
_cell_state_tmp2.allocator()->init(TensorInfo(_input_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4));
- _mul_input_gate_input_mod_gate.configure(&_input_gate_output, &_input_modulation_gate_output, &_cell_state_tmp2, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _mul_input_gate_input_mod_gate.configure(compile_context, &_input_gate_output, &_input_modulation_gate_output, &_cell_state_tmp2, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
_input_modulation_gate_output.allocator()->allocate();
_input_gate_output.allocator()->allocate();
- _add_cell_state_tmps.configure(&_cell_state_tmp1, &_cell_state_tmp2, cell_state_out, ConvertPolicy::SATURATE);
+ _add_cell_state_tmps.configure(compile_context, &_cell_state_tmp1, &_cell_state_tmp2, cell_state_out, ConvertPolicy::SATURATE);
_cell_state_tmp1.allocator()->allocate();
_cell_state_tmp2.allocator()->allocate();
// Short term memory
_memory_group.manage(&_output_state_tmp);
_output_state_tmp.allocator()->init(TensorInfo(cell_state_out->info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
- _tanh_output_state.configure(cell_state_out, &_output_state_tmp, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f));
+ _tanh_output_state.configure(compile_context, cell_state_out, &_output_state_tmp, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f));
_memory_group.manage(&_output_state_out_symm);
_output_state_out_symm.allocator()->init(TensorInfo(_output_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
- _mul_output_state_tmp_output_gate.configure(&_output_state_tmp, &_output_gate_output, &_output_state_out_symm, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _mul_output_state_tmp_output_gate.configure(compile_context, &_output_state_tmp, &_output_gate_output, &_output_state_out_symm, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
_output_gate_output.allocator()->allocate();
_output_state_tmp.allocator()->allocate();
// Requantize the output state from QSYMM16 to QASYMM8
_memory_group.manage(&_output_state_out_f32);
_output_state_out_f32.allocator()->init(TensorInfo(_output_state_out_symm.info()->tensor_shape(), 1, DataType::F32));
- _dequantize.configure(&_output_state_out_symm, &_output_state_out_f32);
+ _dequantize.configure(compile_context, &_output_state_out_symm, &_output_state_out_f32);
_output_state_out_symm.allocator()->allocate();
- _quantize.configure(&_output_state_out_f32, output_state_out);
+ _quantize.configure(compile_context, &_output_state_out_f32, output_state_out);
_output_state_out_f32.allocator()->allocate();
}
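The output stage above depends on expressing a real-valued rescale factor as a Q0.31 integer multiplier plus a right shift, which is what calculate_quantized_multiplier produces. A self-contained sketch of that decomposition (standard gemmlowp-style arithmetic, not the library's implementation):

```cpp
#include <cmath>
#include <cstdint>

// Decompose real_multiplier in (0, 1) as quantized_multiplier * 2^(-right_shift),
// where quantized_multiplier is a Q0.31 fixed-point value.
void decompose_multiplier(double real_multiplier, int32_t *quantized_multiplier, int *right_shift)
{
    int          exponent    = 0;
    const double significand = std::frexp(real_multiplier, &exponent); // in [0.5, 1)
    *right_shift             = -exponent;

    auto q = static_cast<int64_t>(std::round(significand * (1LL << 31)));
    if(q == (1LL << 31)) // significand rounded up to 1.0: renormalize
    {
        q /= 2;
        --(*right_shift); // right_shift of -1 encodes a multiplier of ~1.0 (left shift by 1)
    }
    *quantized_multiplier = static_cast<int32_t>(q);
}
```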
diff --git a/src/runtime/CL/functions/CLLaplacianPyramid.cpp b/src/runtime/CL/functions/CLLaplacianPyramid.cpp
index a118518..831f0cd 100644
--- a/src/runtime/CL/functions/CLLaplacianPyramid.cpp
+++ b/src/runtime/CL/functions/CLLaplacianPyramid.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -32,7 +32,6 @@
#include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h"
#include "arm_compute/runtime/CL/functions/CLGaussian5x5.h"
#include "arm_compute/runtime/CL/functions/CLGaussianPyramid.h"
-#include "support/ToolchainSupport.h"
using namespace arm_compute;
@@ -49,6 +48,11 @@
void CLLaplacianPyramid::configure(ICLTensor *input, CLPyramid *pyramid, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, pyramid, output, border_mode, constant_border_value);
+}
+
+void CLLaplacianPyramid::configure(const CLCompileContext &compile_context, ICLTensor *input, CLPyramid *pyramid, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
+{
ARM_COMPUTE_ERROR_ON(nullptr == pyramid);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S16);
@@ -68,18 +72,18 @@
_conv_pyr.init(pyramid_info);
// Create Gaussian Pyramid function
- _gaussian_pyr_function.configure(input, &_gauss_pyr, border_mode, constant_border_value);
+ _gaussian_pyr_function.configure(compile_context, input, &_gauss_pyr, border_mode, constant_border_value);
_convf.resize(_num_levels);
_subf.resize(_num_levels);
for(unsigned int i = 0; i < _num_levels; ++i)
{
- _convf[i].configure(_gauss_pyr.get_pyramid_level(i), _conv_pyr.get_pyramid_level(i), border_mode, constant_border_value);
- _subf[i].configure(_gauss_pyr.get_pyramid_level(i), _conv_pyr.get_pyramid_level(i), pyramid->get_pyramid_level(i), ConvertPolicy::WRAP);
+ _convf[i].configure(compile_context, _gauss_pyr.get_pyramid_level(i), _conv_pyr.get_pyramid_level(i), border_mode, constant_border_value);
+ _subf[i].configure(compile_context, _gauss_pyr.get_pyramid_level(i), _conv_pyr.get_pyramid_level(i), pyramid->get_pyramid_level(i), ConvertPolicy::WRAP);
}
- _depth_function.configure(_conv_pyr.get_pyramid_level(_num_levels - 1), output, ConvertPolicy::WRAP, 0);
+ _depth_function.configure(compile_context, _conv_pyr.get_pyramid_level(_num_levels - 1), output, ConvertPolicy::WRAP, 0);
_gauss_pyr.allocate();
_conv_pyr.allocate();
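Stated as a recurrence, the configuration above computes, for Gaussian pyramid levels G_i and the 5x5 Gaussian filter conv:

```latex
L_i = G_i - \mathrm{conv}(G_i), \quad i = 0, \dots, n-1,
\qquad
y = \mathrm{s16}\bigl(\mathrm{conv}(G_{n-1})\bigr)
```

so the pyramid holds the band-pass residuals and the function's output carries the remaining low-pass level.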
diff --git a/src/runtime/CL/functions/CLLaplacianReconstruct.cpp b/src/runtime/CL/functions/CLLaplacianReconstruct.cpp
index 13116bf..ea6a3f9 100644
--- a/src/runtime/CL/functions/CLLaplacianReconstruct.cpp
+++ b/src/runtime/CL/functions/CLLaplacianReconstruct.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,7 +28,6 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
-#include "support/ToolchainSupport.h"
#include <cstddef>
@@ -44,6 +43,11 @@
void CLLaplacianReconstruct::configure(const CLPyramid *pyramid, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
{
+ configure(CLKernelLibrary::get().get_compile_context(), pyramid, input, output, border_mode, constant_border_value);
+}
+
+void CLLaplacianReconstruct::configure(const CLCompileContext &compile_context, const CLPyramid *pyramid, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
+{
ARM_COMPUTE_ERROR_ON(nullptr == pyramid);
ARM_COMPUTE_ERROR_ON(input == output);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S16);
@@ -68,17 +72,17 @@
const size_t last_level = num_levels - 1;
- _addf[last_level].configure(input, pyramid->get_pyramid_level(last_level), _tmp_pyr.get_pyramid_level(last_level), ConvertPolicy::SATURATE);
+ _addf[last_level].configure(compile_context, input, pyramid->get_pyramid_level(last_level), _tmp_pyr.get_pyramid_level(last_level), ConvertPolicy::SATURATE);
// Scale levels n-1 to 1, and add levels n-2 to 0
for(size_t l = 0; l < last_level; ++l)
{
- _scalef[l].configure(_tmp_pyr.get_pyramid_level(l + 1), _tmp_pyr.get_pyramid_level(l), arm_compute::InterpolationPolicy::NEAREST_NEIGHBOR, border_mode, constant_border_value);
- _addf[l].configure(_tmp_pyr.get_pyramid_level(l), pyramid->get_pyramid_level(l), _tmp_pyr.get_pyramid_level(l), ConvertPolicy::SATURATE);
+ _scalef[l].configure(compile_context, _tmp_pyr.get_pyramid_level(l + 1), _tmp_pyr.get_pyramid_level(l), arm_compute::InterpolationPolicy::NEAREST_NEIGHBOR, border_mode, constant_border_value);
+ _addf[l].configure(compile_context, _tmp_pyr.get_pyramid_level(l), pyramid->get_pyramid_level(l), _tmp_pyr.get_pyramid_level(l), ConvertPolicy::SATURATE);
}
// Convert level 0 from S16 to U8
- _depthf.configure(_tmp_pyr.get_pyramid_level(0), output, ConvertPolicy::SATURATE, 0);
+ _depthf.configure(compile_context, _tmp_pyr.get_pyramid_level(0), output, ConvertPolicy::SATURATE, 0);
_tmp_pyr.allocate();
}
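The reconstruction inverts the decomposition built by CLLaplacianPyramid above; with Laplacian levels L_l and low-pass input x, the loop computes:

```latex
t_{n-1} = x + L_{n-1},
\qquad
t_l = \mathrm{upsample}(t_{l+1}) + L_l \;\; (l = n-2, \dots, 0),
\qquad
y = \mathrm{u8}(t_0)
```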
diff --git a/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp b/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
index 3e99dde..950be50 100644
--- a/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -129,6 +129,12 @@
void CLLocallyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info);
+}
+
+void CLLocallyConnectedLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
+ const PadStrideInfo &conv_info)
+{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
ARM_COMPUTE_ERROR_THROW_ON(CLLocallyConnectedLayer::validate(input->info(), weights->info(), biases == nullptr ? nullptr : biases->info(), output->info(), conv_info));
@@ -160,10 +166,10 @@
_memory_group.manage(&_gemm_output);
// Configure kernels
- _input_im2col_kernel.configure(input, &_input_im2col_reshaped, Size2D(kernel_width, kernel_height), conv_info, _has_bias);
- _weights_reshape_kernel.configure(weights, biases, &_weights_reshaped);
- _mm_kernel.configure(&_input_im2col_reshaped, &_weights_reshaped, &_gemm_output);
- _output_col2im_kernel.configure(&_gemm_output, output, Size2D(conv_w, conv_h));
+ _input_im2col_kernel.configure(compile_context, input, &_input_im2col_reshaped, Size2D(kernel_width, kernel_height), conv_info, _has_bias);
+ _weights_reshape_kernel.configure(compile_context, weights, biases, &_weights_reshaped);
+ _mm_kernel.configure(compile_context, &_input_im2col_reshaped, &_weights_reshaped, &_gemm_output);
+ _output_col2im_kernel.configure(compile_context, &_gemm_output, output, Size2D(conv_w, conv_h));
// Allocate intermediate tensors
_input_im2col_reshaped.allocator()->allocate();
diff --git a/src/runtime/CL/functions/CLMagnitude.cpp b/src/runtime/CL/functions/CLMagnitude.cpp
index e2dfe3a..a267952 100644
--- a/src/runtime/CL/functions/CLMagnitude.cpp
+++ b/src/runtime/CL/functions/CLMagnitude.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLMagnitude.h"
#include "arm_compute/core/CL/kernels/CLMagnitudePhaseKernel.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
#include <utility>
@@ -32,7 +32,12 @@
void CLMagnitude::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, MagnitudeType mag_type)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, mag_type);
+}
+
+void CLMagnitude::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, MagnitudeType mag_type)
+{
auto k = arm_compute::support::cpp14::make_unique<CLMagnitudePhaseKernel>();
- k->configure(input1, input2, output, nullptr, mag_type);
+ k->configure(compile_context, input1, input2, output, nullptr, mag_type);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLMeanStdDev.cpp b/src/runtime/CL/functions/CLMeanStdDev.cpp
index 8517b59..e3ce704 100644
--- a/src/runtime/CL/functions/CLMeanStdDev.cpp
+++ b/src/runtime/CL/functions/CLMeanStdDev.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -66,6 +66,11 @@
void CLMeanStdDev::configure(ICLImage *input, float *mean, float *stddev)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, mean, stddev);
+}
+
+void CLMeanStdDev::configure(const CLCompileContext &compile_context, ICLImage *input, float *mean, float *stddev)
+{
// In the case of F16/F32 we run reduction operations to calculate the mean and standard deviation
_data_type = input->info()->data_type();
@@ -74,14 +79,14 @@
_num_pixels = input->info()->dimension(0) * input->info()->dimension(1);
_memory_group.manage(&_reduction_output_mean);
- _reduction_operation_mean.configure(input, &_reduction_output_mean, 0, ReductionOperation::SUM);
+ _reduction_operation_mean.configure(compile_context, input, &_reduction_output_mean, 0, ReductionOperation::SUM);
_reduction_output_mean.allocator()->allocate();
_mean = mean;
if(stddev != nullptr)
{
_memory_group.manage(&_reduction_output_stddev);
- _reduction_operation_stddev.configure(input, &_reduction_output_stddev, 0, ReductionOperation::SUM_SQUARE);
+ _reduction_operation_stddev.configure(compile_context, input, &_reduction_output_stddev, 0, ReductionOperation::SUM_SQUARE);
_reduction_output_stddev.allocator()->allocate();
_stddev = stddev;
_run_stddev = true;
@@ -96,8 +101,8 @@
_global_sum_squared = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_ulong));
}
- _mean_stddev_kernel.configure(input, mean, &_global_sum, stddev, &_global_sum_squared);
- _fill_border_kernel.configure(input, _mean_stddev_kernel.border_size(), BorderMode::CONSTANT, PixelValue(static_cast<uint8_t>(0)));
+ _mean_stddev_kernel.configure(compile_context, input, mean, &_global_sum, stddev, &_global_sum_squared);
+ _fill_border_kernel.configure(compile_context, input, _mean_stddev_kernel.border_size(), BorderMode::CONSTANT, PixelValue(static_cast<uint8_t>(0)));
}
}
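
A hedged usage sketch of the U8 path (image size is illustrative); F16/F32 inputs take the reduction-based branch above instead:

    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLMeanStdDev.h"

    using namespace arm_compute;

    CLScheduler::get().default_init();       // create default context and queue

    CLTensor image;
    image.allocator()->init(TensorInfo(TensorShape(640U, 480U), 1, DataType::U8));

    float mean = 0.f, stddev = 0.f;
    CLMeanStdDev msd;
    msd.configure(&image, &mean, &stddev);   // uses the default compile context

    image.allocator()->allocate();
    // ... upload pixel data ...
    msd.run();
    CLScheduler::get().sync();               // mean and stddev now hold the results
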
diff --git a/src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp b/src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp
index 802a2fc..3dbab76 100644
--- a/src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp
+++ b/src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,14 +25,19 @@
#include "arm_compute/core/CL/kernels/CLMeanStdDevNormalizationKernel.h"
#include "arm_compute/core/Types.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
namespace arm_compute
{
void CLMeanStdDevNormalizationLayer::configure(ICLTensor *input, ICLTensor *output, float epsilon)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, output, epsilon);
+}
+
+void CLMeanStdDevNormalizationLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, float epsilon)
+{
auto k = arm_compute::support::cpp14::make_unique<CLMeanStdDevNormalizationKernel>();
- k->configure(input, output, epsilon);
+ k->configure(compile_context, input, output, epsilon);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLMedian3x3.cpp b/src/runtime/CL/functions/CLMedian3x3.cpp
index 55f9eaa..dc53240 100644
--- a/src/runtime/CL/functions/CLMedian3x3.cpp
+++ b/src/runtime/CL/functions/CLMedian3x3.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,7 +25,7 @@
#include "arm_compute/core/CL/kernels/CLMedian3x3Kernel.h"
#include "arm_compute/core/PixelValue.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
#include <utility>
@@ -33,8 +33,13 @@
void CLMedian3x3::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, output, border_mode, constant_border_value);
+}
+
+void CLMedian3x3::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
+{
auto k = arm_compute::support::cpp14::make_unique<CLMedian3x3Kernel>();
- k->configure(input, output, border_mode == BorderMode::UNDEFINED);
+ k->configure(compile_context, input, output, border_mode == BorderMode::UNDEFINED);
_kernel = std::move(k);
- _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+ _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
}
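
The flag handed to the kernel only records whether the border is left undefined; for every other mode the shared fill-border kernel pre-populates the one-pixel frame first. A short sketch (shape and border value are illustrative):

    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLMedian3x3.h"

    using namespace arm_compute;

    CLTensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(128U, 128U), 1, DataType::U8));
    dst.allocator()->init(TensorInfo(TensorShape(128U, 128U), 1, DataType::U8));

    CLMedian3x3 median;
    // CONSTANT : _border_handler fills the frame with 0 before filtering.
    // UNDEFINED: the kernel never touches the frame at all.
    median.configure(&src, &dst, BorderMode::CONSTANT, 0);
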
diff --git a/src/runtime/CL/functions/CLMinMaxLocation.cpp b/src/runtime/CL/functions/CLMinMaxLocation.cpp
index 49dcbcb..15b2833 100644
--- a/src/runtime/CL/functions/CLMinMaxLocation.cpp
+++ b/src/runtime/CL/functions/CLMinMaxLocation.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -43,6 +43,13 @@
void CLMinMaxLocation::configure(const ICLImage *input, void *min, void *max, CLCoordinates2DArray *min_loc, CLCoordinates2DArray *max_loc, uint32_t *min_count, uint32_t *max_count)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, min, max, min_loc, max_loc, min_count, max_count);
+}
+
+void CLMinMaxLocation::configure(const CLCompileContext &compile_context, const ICLImage *input, void *min, void *max, CLCoordinates2DArray *min_loc, CLCoordinates2DArray *max_loc,
+ uint32_t *min_count,
+ uint32_t *max_count)
+{
ARM_COMPUTE_ERROR_ON(nullptr == min);
ARM_COMPUTE_ERROR_ON(nullptr == max);
@@ -55,8 +62,8 @@
_min_loc = min_loc;
_max_loc = max_loc;
- _min_max_kernel.configure(input, &_min_max_vals);
- _min_max_loc_kernel.configure(input, &_min_max_vals, &_min_max_count_vals, _min_loc, _max_loc);
+ _min_max_kernel.configure(compile_context, input, &_min_max_vals);
+ _min_max_loc_kernel.configure(compile_context, input, &_min_max_vals, &_min_max_count_vals, _min_loc, _max_loc);
}
void CLMinMaxLocation::run()
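
A hedged usage sketch (array capacities are illustrative; treating the min/max outputs as 32-bit integers for a U8 input is an assumption about the kernel's result buffer):

    #include "arm_compute/runtime/CL/CLArray.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLMinMaxLocation.h"

    using namespace arm_compute;

    CLTensor image; // U8 image, initialised and allocated as usual

    int32_t  min_val = 0, max_val = 0;       // assumed int32 result storage
    uint32_t min_count = 0, max_count = 0;
    CLCoordinates2DArray min_loc(100), max_loc(100);

    CLMinMaxLocation mml;
    mml.configure(&image, &min_val, &max_val, &min_loc, &max_loc, &min_count, &max_count);
    mml.run();
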
diff --git a/src/runtime/CL/functions/CLNonLinearFilter.cpp b/src/runtime/CL/functions/CLNonLinearFilter.cpp
index d37412f..96912a2 100644
--- a/src/runtime/CL/functions/CLNonLinearFilter.cpp
+++ b/src/runtime/CL/functions/CLNonLinearFilter.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLNonLinearFilter.h"
#include "arm_compute/core/CL/kernels/CLNonLinearFilterKernel.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
#include <utility>
@@ -33,8 +33,14 @@
void CLNonLinearFilter::configure(ICLTensor *input, ICLTensor *output, NonLinearFilterFunction function, unsigned int mask_size, MatrixPattern pattern, const uint8_t *mask,
BorderMode border_mode, uint8_t constant_border_value)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, output, function, mask_size, pattern, mask, border_mode, constant_border_value);
+}
+
+void CLNonLinearFilter::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, NonLinearFilterFunction function, unsigned int mask_size, MatrixPattern pattern,
+ const uint8_t *mask, BorderMode border_mode, uint8_t constant_border_value)
+{
auto k = arm_compute::support::cpp14::make_unique<CLNonLinearFilterKernel>();
- k->configure(input, output, function, mask_size, pattern, mask, border_mode == BorderMode::UNDEFINED);
+ k->configure(compile_context, input, output, function, mask_size, pattern, mask, border_mode == BorderMode::UNDEFINED);
_kernel = std::move(k);
- _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+ _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
}
diff --git a/src/runtime/CL/functions/CLNonMaximaSuppression3x3.cpp b/src/runtime/CL/functions/CLNonMaximaSuppression3x3.cpp
index c0a0cef..6d4a28d 100644
--- a/src/runtime/CL/functions/CLNonMaximaSuppression3x3.cpp
+++ b/src/runtime/CL/functions/CLNonMaximaSuppression3x3.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLNonMaximaSuppression3x3.h"
#include "arm_compute/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
#include <utility>
@@ -32,16 +32,21 @@
void CLNonMaximaSuppression3x3::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, output, border_mode);
+}
+
+void CLNonMaximaSuppression3x3::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, BorderMode border_mode)
+{
auto k = arm_compute::support::cpp14::make_unique<CLNonMaximaSuppression3x3Kernel>();
- k->configure(input, output, border_mode == BorderMode::UNDEFINED);
+ k->configure(compile_context, input, output, border_mode == BorderMode::UNDEFINED);
_kernel = std::move(k);
if(border_mode != BorderMode::UNDEFINED)
{
- _border_handler.configure(input, _kernel->border_size(), BorderMode::CONSTANT);
+ _border_handler.configure(compile_context, input, _kernel->border_size(), BorderMode::CONSTANT);
}
else
{
- _border_handler.configure(input, _kernel->border_size(), BorderMode::UNDEFINED);
+ _border_handler.configure(compile_context, input, _kernel->border_size(), BorderMode::UNDEFINED);
}
}
diff --git a/src/runtime/CL/functions/CLNormalizationLayer.cpp b/src/runtime/CL/functions/CLNormalizationLayer.cpp
index 8489fab..f59a4ca 100644
--- a/src/runtime/CL/functions/CLNormalizationLayer.cpp
+++ b/src/runtime/CL/functions/CLNormalizationLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -39,13 +39,18 @@
void CLNormalizationLayer::configure(ICLTensor *input, ICLTensor *output, const NormalizationLayerInfo &norm_info)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, output, norm_info);
+}
+
+void CLNormalizationLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const NormalizationLayerInfo &norm_info)
+{
ARM_COMPUTE_ERROR_ON(input == nullptr);
// Configure normalization kernel
- _norm_kernel.configure(input, output, norm_info);
+ _norm_kernel.configure(compile_context, input, output, norm_info);
// Fill the border by 3 elements since we need vload4 in the IN_MAP normalization kernel
- _border_handler.configure(input, _norm_kernel.border_size(), BorderMode::CONSTANT, PixelValue());
+ _border_handler.configure(compile_context, input, _norm_kernel.border_size(), BorderMode::CONSTANT, PixelValue());
}
Status CLNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const NormalizationLayerInfo &norm_info)
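
validate() mirrors configure() without touching the device, so a configuration can be checked up front. A sketch (shapes and norm settings are illustrative):

    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLNormalizationLayer.h"

    using namespace arm_compute;

    const TensorInfo             in_info(TensorShape(16U, 16U, 32U), 1, DataType::F32);
    const TensorInfo             out_info(in_info);
    const NormalizationLayerInfo norm(NormType::CROSS_MAP, 5);

    CLTensor input, output; // initialised/allocated to match the infos above

    if(bool(CLNormalizationLayer::validate(&in_info, &out_info, norm)))
    {
        CLNormalizationLayer layer;
        layer.configure(&input, &output, norm);
    }
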
diff --git a/src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp b/src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp
index c5de591..b03de64 100644
--- a/src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp
+++ b/src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp
@@ -25,7 +25,7 @@
#include "arm_compute/runtime/CL/functions/CLNormalizePlanarYUVLayer.h"
#include "arm_compute/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
#include <utility>
@@ -33,8 +33,13 @@
{
void CLNormalizePlanarYUVLayer::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *std)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, output, mean, std);
+}
+
+void CLNormalizePlanarYUVLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *std)
+{
auto k = arm_compute::support::cpp14::make_unique<CLNormalizePlanarYUVLayerKernel>();
- k->configure(input, output, mean, std);
+ k->configure(compile_context, input, output, mean, std);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLOpticalFlow.cpp b/src/runtime/CL/functions/CLOpticalFlow.cpp
index a013a1f..5f7c170 100644
--- a/src/runtime/CL/functions/CLOpticalFlow.cpp
+++ b/src/runtime/CL/functions/CLOpticalFlow.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,7 +33,7 @@
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/CLTensorAllocator.h"
#include "arm_compute/runtime/CL/functions/CLScharr3x3.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
using namespace arm_compute;
@@ -62,6 +62,15 @@
Termination termination, float epsilon, size_t num_iterations, size_t window_dimension, bool use_initial_estimate,
BorderMode border_mode, uint8_t constant_border_value)
{
+ configure(CLKernelLibrary::get().get_compile_context(), old_pyramid, new_pyramid, old_points, new_points_estimates, new_points, termination, epsilon, num_iterations, window_dimension,
+ use_initial_estimate, border_mode, constant_border_value);
+}
+
+void CLOpticalFlow::configure(const CLCompileContext &compile_context, const CLPyramid *old_pyramid, const CLPyramid *new_pyramid,
+ const ICLKeyPointArray *old_points, const ICLKeyPointArray *new_points_estimates, ICLKeyPointArray *new_points,
+ Termination termination, float epsilon, size_t num_iterations, size_t window_dimension, bool use_initial_estimate,
+ BorderMode border_mode, uint8_t constant_border_value)
+{
ARM_COMPUTE_ERROR_ON(nullptr == old_pyramid);
ARM_COMPUTE_ERROR_ON(nullptr == new_pyramid);
ARM_COMPUTE_ERROR_ON(nullptr == old_points);
@@ -122,18 +131,18 @@
_memory_group.manage(&_scharr_gy[i]);
// Init Scharr kernel
- _func_scharr[i].configure(old_ith_input, &_scharr_gx[i], &_scharr_gy[i], border_mode, constant_border_value);
+ _func_scharr[i].configure(compile_context, old_ith_input, &_scharr_gx[i], &_scharr_gy[i], border_mode, constant_border_value);
// Init Lucas-Kanade init kernel
- _tracker_init_kernel[i].configure(old_points, new_points_estimates, _old_points_internal.get(), _new_points_internal.get(), use_initial_estimate, i, _num_levels, pyr_scale);
+ _tracker_init_kernel[i].configure(compile_context, old_points, new_points_estimates, _old_points_internal.get(), _new_points_internal.get(), use_initial_estimate, i, _num_levels, pyr_scale);
// Init Lucas-Kanade stage0 kernel
- _tracker_stage0_kernel[i].configure(old_ith_input, &_scharr_gx[i], &_scharr_gy[i],
+ _tracker_stage0_kernel[i].configure(compile_context, old_ith_input, &_scharr_gx[i], &_scharr_gy[i],
_old_points_internal.get(), _new_points_internal.get(), _coefficient_table.get(), _old_values.get(),
window_dimension, i);
// Init Lucas-Kanade stage1 kernel
- _tracker_stage1_kernel[i].configure(new_ith_input, _new_points_internal.get(), _coefficient_table.get(), _old_values.get(),
+ _tracker_stage1_kernel[i].configure(compile_context, new_ith_input, _new_points_internal.get(), _coefficient_table.get(), _old_values.get(),
termination, epsilon, num_iterations, window_dimension, i);
// Allocate intermediate buffers
@@ -142,7 +151,7 @@
}
// Finalize Lucas-Kanade
- _tracker_finalize_kernel.configure(_new_points_internal.get(), new_points);
+ _tracker_finalize_kernel.configure(compile_context, _new_points_internal.get(), new_points);
}
void CLOpticalFlow::run()
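
Driving the function takes two image pyramids plus keypoint arrays. A hedged setup sketch (pyramid geometry, capacities and termination criteria are illustrative, and the PyramidInfo/CLKeyPointArray usage is an assumption about the runtime helpers):

    #include "arm_compute/runtime/CL/CLPyramid.h"
    #include "arm_compute/runtime/CL/functions/CLOpticalFlow.h"

    using namespace arm_compute;

    PyramidInfo pyr_info(3, SCALE_PYRAMID_HALF, TensorShape(640U, 480U), Format::U8);
    CLPyramid old_pyramid, new_pyramid;
    old_pyramid.init(pyr_info);
    new_pyramid.init(pyr_info);

    CLKeyPointArray old_points(1000), estimates(1000), new_points(1000);

    CLOpticalFlow flow;
    flow.configure(&old_pyramid, &new_pyramid, &old_points, &estimates, &new_points,
                   Termination::TERM_CRITERIA_BOTH, 0.01f /*epsilon*/, 20 /*iterations*/,
                   5 /*window*/, true /*use_initial_estimate*/, BorderMode::UNDEFINED, 0);
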
diff --git a/src/runtime/CL/functions/CLPReluLayer.cpp b/src/runtime/CL/functions/CLPReluLayer.cpp
index d463ef9..6543ab9 100644
--- a/src/runtime/CL/functions/CLPReluLayer.cpp
+++ b/src/runtime/CL/functions/CLPReluLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,13 +25,13 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/runtime/CL/functions/CLPReluLayer.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
namespace arm_compute
{
namespace
{
-void configure_border_handler(CLFillBorderKernel &border_handler, BorderSize border_size, ICLTensor *input1, ICLTensor *input2, const ICLTensor *output)
+void configure_border_handler(const CLCompileContext &compile_context, CLFillBorderKernel &border_handler, BorderSize border_size, ICLTensor *input1, ICLTensor *input2, const ICLTensor *output)
{
if(output->info()->dimension(0) > 1)
{
@@ -39,7 +39,7 @@
if(broadcasted_info->info()->dimension(0) == 1)
{
- border_handler.configure(broadcasted_info, border_size, BorderMode::REPLICATE);
+ border_handler.configure(compile_context, broadcasted_info, border_size, BorderMode::REPLICATE);
}
}
}
@@ -47,10 +47,15 @@
void CLPReluLayer::configure(ICLTensor *input, ICLTensor *alpha, ICLTensor *output)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, alpha, output);
+}
+
+void CLPReluLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *alpha, ICLTensor *output)
+{
auto k = arm_compute::support::cpp14::make_unique<CLArithmeticOperationKernel>();
- k->configure(ArithmeticOperation::PRELU, input, alpha, output);
+ k->configure(compile_context, ArithmeticOperation::PRELU, input, alpha, output);
_kernel = std::move(k);
- configure_border_handler(_border_handler, _kernel->border_size(), input, alpha, output);
+ configure_border_handler(compile_context, _border_handler, _kernel->border_size(), input, alpha, output);
}
Status CLPReluLayer::validate(const ITensorInfo *input, const ITensorInfo *alpha, const ITensorInfo *output)
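
Because PRelu is lowered onto the generic elementwise kernel with ArithmeticOperation::PRELU, alpha follows the usual elementwise broadcasting rules (the x-broadcast case being the one the border handler above covers). A sketch (shapes illustrative):

    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLPReluLayer.h"

    using namespace arm_compute;

    CLTensor input, alpha, output;
    input.allocator()->init(TensorInfo(TensorShape(16U, 16U, 8U), 1, DataType::F32));
    alpha.allocator()->init(TensorInfo(TensorShape(16U, 16U, 8U), 1, DataType::F32));
    output.allocator()->init(TensorInfo(TensorShape(16U, 16U, 8U), 1, DataType::F32));

    CLPReluLayer prelu;
    prelu.configure(&input, &alpha, &output); // out = x >= 0 ? x : alpha * x, elementwise
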
diff --git a/src/runtime/CL/functions/CLPadLayer.cpp b/src/runtime/CL/functions/CLPadLayer.cpp
index 8f36a69..078bdbc 100644
--- a/src/runtime/CL/functions/CLPadLayer.cpp
+++ b/src/runtime/CL/functions/CLPadLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -32,6 +32,11 @@
void CLPadLayer::configure(ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, output, padding, constant_value, mode);
+}
+
+void CLPadLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode)
+{
ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), padding, constant_value, mode));
_perform_pad = std::any_of(padding.begin(), padding.end(), [](PaddingInfo info)
@@ -41,12 +46,12 @@
if(_perform_pad)
{
- _pad_kernel.configure(input, output, padding, constant_value, mode);
+ _pad_kernel.configure(compile_context, input, output, padding, constant_value, mode);
}
else
{
// Copy the input to the whole output if no padding is applied
- _copy_kernel.configure(input, output);
+ _copy_kernel.configure(compile_context, input, output);
}
}
Status CLPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode)
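
A sketch of the two branches (padding amounts are illustrative): a non-empty PaddingList selects the pad kernel, while an all-zero list degenerates to the plain copy above.

    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLPadLayer.h"

    using namespace arm_compute;

    CLTensor input, output; // initialised/allocated elsewhere

    // Pad 1 element on each side of x and 2 on each side of y with zeros.
    const PaddingList padding = { { 1, 1 }, { 2, 2 } };

    CLPadLayer pad;
    pad.configure(&input, &output, padding, PixelValue(0.f), PaddingMode::CONSTANT);
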
diff --git a/src/runtime/CL/functions/CLPermute.cpp b/src/runtime/CL/functions/CLPermute.cpp
index dd11df4..e6323ce 100644
--- a/src/runtime/CL/functions/CLPermute.cpp
+++ b/src/runtime/CL/functions/CLPermute.cpp
@@ -26,14 +26,19 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/CL/kernels/CLPermuteKernel.h"
#include "arm_compute/core/Error.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
namespace arm_compute
{
void CLPermute::configure(const ICLTensor *input, ICLTensor *output, const PermutationVector &perm)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, output, perm);
+}
+
+void CLPermute::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const PermutationVector &perm)
+{
auto k = arm_compute::support::cpp14::make_unique<CLPermuteKernel>();
- k->configure(input, output, perm);
+ k->configure(compile_context, input, output, perm);
_kernel = std::move(k);
}
@@ -42,4 +47,4 @@
ARM_COMPUTE_RETURN_ON_ERROR(CLPermuteKernel::validate(input, output, perm));
return Status{};
}
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
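
A sketch of a common use, the NCHW-to-NHWC rearrangement (shape is illustrative): with arm_compute shapes ordered (W, H, C), PermutationVector(2U, 0U, 1U) produces (C, W, H).

    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLPermute.h"

    using namespace arm_compute;

    CLTensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(224U, 224U, 3U), 1, DataType::F32)); // (W, H, C)

    CLPermute permute;
    permute.configure(&src, &dst, PermutationVector(2U, 0U, 1U)); // dst shape: (3, 224, 224)
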
diff --git a/src/runtime/CL/functions/CLPhase.cpp b/src/runtime/CL/functions/CLPhase.cpp
index cf3fa7e..b915104 100644
--- a/src/runtime/CL/functions/CLPhase.cpp
+++ b/src/runtime/CL/functions/CLPhase.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLPhase.h"
#include "arm_compute/core/CL/kernels/CLMagnitudePhaseKernel.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
#include <utility>
@@ -32,7 +32,12 @@
void CLPhase::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, PhaseType phase_type)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, phase_type);
+}
+
+void CLPhase::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, PhaseType phase_type)
+{
auto k = arm_compute::support::cpp14::make_unique<CLMagnitudePhaseKernel>();
- k->configure(input1, input2, nullptr, output, MagnitudeType::L1NORM, phase_type);
+ k->configure(compile_context, input1, input2, nullptr, output, MagnitudeType::L1NORM, phase_type);
_kernel = std::move(k);
}
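
CLPhase and CLMagnitude are both thin wrappers over CLMagnitudePhaseKernel: the output slot each one passes as nullptr simply disables that half of the computation. Side by side (gx/gy standing for S16 gradient planes, declared and allocated elsewhere):

    CLMagnitude mag;
    mag.configure(&gx, &gy, &magnitude, MagnitudeType::L2NORM); // phase slot is nullptr

    CLPhase phase;
    phase.configure(&gx, &gy, &phase_out, PhaseType::UNSIGNED); // magnitude slot is nullptr
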
diff --git a/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp b/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp
index 959464c..3c1a7de 100644
--- a/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp
+++ b/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,17 +25,23 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
#include <utility>
namespace arm_compute
{
void CLPixelWiseMultiplication::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, float scale,
- ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
+ ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info)
+{
+ configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, scale, overflow_policy, rounding_policy, act_info);
+}
+
+void CLPixelWiseMultiplication::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, float scale,
+ ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info)
{
auto k = arm_compute::support::cpp14::make_unique<CLPixelWiseMultiplicationKernel>();
- k->configure(input1, input2, output, scale, overflow_policy, rounding_policy);
+ k->configure(compile_context, input1, input2, output, scale, overflow_policy, rounding_policy, act_info);
_kernel = std::move(k);
if(output->info()->dimension(0) > 1)
@@ -44,21 +50,26 @@
if(broadcasted_info->info()->dimension(0) == 1)
{
- _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
+ _border_handler.configure(compile_context, broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
}
}
}
Status CLPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale,
- ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
+ ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info)
{
- return CLPixelWiseMultiplicationKernel::validate(input1, input2, output, scale, overflow_policy, rounding_policy);
+ return CLPixelWiseMultiplicationKernel::validate(input1, input2, output, scale, overflow_policy, rounding_policy, act_info);
}
-void CLComplexPixelWiseMultiplication::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output)
+void CLComplexPixelWiseMultiplication::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+{
+ configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info);
+}
+
+void CLComplexPixelWiseMultiplication::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
{
auto k = arm_compute::support::cpp14::make_unique<CLComplexPixelWiseMultiplicationKernel>();
- k->configure(input1, input2, output);
+ k->configure(compile_context, input1, input2, output, act_info);
_kernel = std::move(k);
if(output->info()->dimension(0) > 1)
@@ -67,13 +78,13 @@
if(broadcasted_info->info()->dimension(0) == 1)
{
- _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
+ _border_handler.configure(compile_context, broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
}
}
}
-Status CLComplexPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+Status CLComplexPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
{
- return CLComplexPixelWiseMultiplicationKernel::validate(input1, input2, output);
+ return CLComplexPixelWiseMultiplicationKernel::validate(input1, input2, output, act_info);
}
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
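
The new trailing ActivationLayerInfo fuses an activation into the multiplication instead of running it as a separate layer; the header presumably defaults it to a disabled ActivationLayerInfo(), so existing call sites keep their behaviour. A sketch (scale and policies are illustrative):

    CLTensor a, b, out; // F32 tensors, initialised elsewhere

    CLPixelWiseMultiplication mul;
    mul.configure(&a, &b, &out, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO,
                  ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)); // fused ReLU
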
diff --git a/src/runtime/CL/functions/CLPoolingLayer.cpp b/src/runtime/CL/functions/CLPoolingLayer.cpp
index 8831834..e7735b0 100644
--- a/src/runtime/CL/functions/CLPoolingLayer.cpp
+++ b/src/runtime/CL/functions/CLPoolingLayer.cpp
@@ -26,18 +26,22 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/CL/kernels/CLPoolingLayerKernel.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
namespace arm_compute
{
-void CLPoolingLayer::configure(ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info)
+void CLPoolingLayer::configure(ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info, ICLTensor *indices)
+{
+ configure(CLKernelLibrary::get().get_compile_context(), input, output, pool_info, indices);
+}
+
+void CLPoolingLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info, ICLTensor *indices)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input);
-
// Configure pooling kernel
auto k = arm_compute::support::cpp14::make_unique<CLPoolingLayerKernel>();
k->set_target(CLScheduler::get().target());
- k->configure(input, output, pool_info);
+ k->configure(compile_context, input, output, pool_info, indices);
_kernel = std::move(k);
const DataType data_type = input->info()->data_type();
@@ -75,14 +79,14 @@
default:
ARM_COMPUTE_ERROR("Data layout not supported");
}
- _border_handler.configure(input, _kernel->border_size(), border_mode, pixel_value);
+ _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, pixel_value);
// Tune kernels
CLScheduler::get().tune_kernel_static(*_kernel);
}
-Status CLPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info)
+Status CLPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
{
- return CLPoolingLayerKernel::validate(input, output, pool_info);
+ return CLPoolingLayerKernel::validate(input, output, pool_info, indices);
}
} // namespace arm_compute
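
The new indices tensor, when supplied, records the position of each max element (as consumed by max-unpooling); passing nullptr keeps the old behaviour. A hedged sketch, assuming the PoolingLayerInfo constructor taking pool type, size, data layout and pad/stride info:

    CLTensor in, out, indices; // indices typically U32 with the same shape as out

    PoolingLayerInfo info(PoolingType::MAX, 2, DataLayout::NCHW, PadStrideInfo(2, 2, 0, 0));

    CLPoolingLayer pool;
    pool.configure(&in, &out, info, &indices); // nullptr indices skips index extraction
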
diff --git a/src/runtime/CL/functions/CLPriorBoxLayer.cpp b/src/runtime/CL/functions/CLPriorBoxLayer.cpp
index 4f6c969..d01b4c7 100644
--- a/src/runtime/CL/functions/CLPriorBoxLayer.cpp
+++ b/src/runtime/CL/functions/CLPriorBoxLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -40,6 +40,11 @@
void CLPriorBoxLayer::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, info);
+}
+
+void CLPriorBoxLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info)
+{
_min = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, info.min_sizes().size() * sizeof(float));
_aspect_ratios = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, info.aspect_ratios().size() * sizeof(float));
if(!info.max_sizes().empty())
@@ -48,7 +53,7 @@
}
auto k = arm_compute::support::cpp14::make_unique<CLPriorBoxLayerKernel>();
- k->configure(input1, input2, output, info, &_min, &_max, &_aspect_ratios);
+ k->configure(compile_context, input1, input2, output, info, &_min, &_max, &_aspect_ratios);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLQLSTMLayer.cpp b/src/runtime/CL/functions/CLQLSTMLayer.cpp
new file mode 100644
index 0000000..8b21424
--- /dev/null
+++ b/src/runtime/CL/functions/CLQLSTMLayer.cpp
@@ -0,0 +1,1123 @@
+/*
+ * Copyright (c) 2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLQLSTMLayer.h"
+
+#include "arm_compute/core/KernelDescriptors.h"
+#include "arm_compute/core/QuantizationInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/InfoHelpers.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+namespace arm_compute
+{
+using namespace arm_compute::utils::info_helpers;
+namespace
+{
+Status validate_mm(GEMMLowpOutputStageInfo &gemmlowp_info, const ITensorInfo *mm_input, const ITensorInfo *mm_weights, const ITensorInfo *bias,
+ float gemmlowp_scale, const TensorInfo *mm_res_info, const TensorInfo *outstage_tensor_info)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate(mm_input, mm_weights, nullptr, mm_res_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(mm_res_info, bias, outstage_tensor_info, gemmlowp_info));
+ return Status{};
+}
+} // namespace
+
+Status CLQLSTMLayer::TensorCopyKernel::validate(const ITensorInfo &src, const ITensorInfo &dst)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON(src.tensor_shape().num_dimensions() > max_dimension_supported);
+ ARM_COMPUTE_RETURN_ERROR_ON(dst.tensor_shape().num_dimensions() > max_dimension_supported);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &dst);
+ ARM_COMPUTE_RETURN_ERROR_ON(dst.tensor_shape().y() != src.tensor_shape().y());
+ return Status{};
+}
+
+void CLQLSTMLayer::TensorCopyKernel::configure(ICLTensor &src, ICLTensor &dst)
+{
+ ARM_COMPUTE_ERROR_THROW_ON(CLQLSTMLayer::TensorCopyKernel::validate(*src.info(), *dst.info()));
+ _src = &src;
+ _dst = &dst;
+ _row_size = std::min(_src->info()->tensor_shape().x(), _dst->info()->tensor_shape().x());
+ _window = calculate_max_window(*_src->info(), Steps());
+}
+
+void CLQLSTMLayer::TensorCopyKernel::run()
+{
+ auto &q = CLScheduler::get().queue();
+
+ _src->map(q, true);
+ _dst->map(q, true);
+
+ Iterator input_iter{ _src, _window };
+ Iterator output_iter{ _dst, _window };
+
+ execute_window_loop(_window, [&](const Coordinates &)
+ {
+ memcpy(output_iter.ptr(), input_iter.ptr(), _row_size);
+ },
+ input_iter, output_iter);
+
+ _src->unmap(q);
+ _dst->unmap(q);
+}
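
TensorCopyKernel maps both tensors and copies min(src_width, dst_width) elements per row (bytes, for the 8-bit data used here), which is how the projection path below moves data between the (output_size, batch) and (num_units, batch) views. The host-side idea in isolation, as a plain C++ sketch independent of the library:

    #include <algorithm>
    #include <cstring>
    #include <vector>

    // Row-wise copy between two row-major 2D buffers of different widths:
    // each row transfers only the overlapping prefix.
    void copy_rows(const std::vector<int8_t> &src, size_t src_w,
                   std::vector<int8_t> &dst, size_t dst_w, size_t rows)
    {
        const size_t row_bytes = std::min(src_w, dst_w) * sizeof(int8_t);
        for(size_t y = 0; y < rows; ++y)
        {
            std::memcpy(dst.data() + y * dst_w, src.data() + y * src_w, row_bytes);
        }
    }
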
+
+CLQLSTMLayer::CLQLSTMLayer(std::shared_ptr<IMemoryManager> memory_manager)
+{
+ _memory_group = MemoryGroup(std::move(memory_manager));
+}
+
+void CLQLSTMLayer::configure_mm(const CLCompileContext &compile_context, CLGEMMLowpMatrixMultiplyCore &mm, CLGEMMLowpOutputStage &outstage, GEMMLowpOutputStageInfo &gemmlowp_info,
+ const ICLTensor *mm_input, const ICLTensor *mm_weights, const ICLTensor *bias,
+ CLTensor *mm_res, CLTensor *outstage_res, float gemmlowp_scale,
+ const TensorInfo &mm_res_info, const TensorInfo &outstage_tensor_info)
+{
+ _memory_group.manage(mm_res);
+ _memory_group.manage(outstage_res);
+
+ mm_res->allocator()->init(mm_res_info);
+ outstage_res->allocator()->init(outstage_tensor_info);
+
+ // Configure matrix-multiplication
+ mm.configure(compile_context, mm_input, mm_weights, nullptr, mm_res);
+
+ // Configure output stage
+ quantization::calculate_quantized_multiplier(gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift);
+ outstage.configure(compile_context, mm_res, bias, outstage_res, gemmlowp_info);
+ mm_res->allocator()->allocate();
+}
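
Each gate's effective scale (input_scale * weight_scale / intermediate_scale) is folded into the output stage as a fixed-point multiplier plus shift. A self-contained sketch of that decomposition, mirroring (not reproducing) quantization::calculate_quantized_multiplier:

    #include <cmath>
    #include <cstdint>

    // Decompose a real-valued scale into a Q0.31 fixed-point multiplier and a
    // power-of-two right shift; a negative right shift denotes a left shift
    // (i.e. a scale greater than one).
    void decompose_scale(float scale, int32_t &multiplier, int32_t &right_shift)
    {
        int exponent   = 0;
        const double q = std::frexp(scale, &exponent); // q in [0.5, 1)
        int64_t fixed  = static_cast<int64_t>(std::llround(q * (1ll << 31)));
        if(fixed == (1ll << 31)) // rounding pushed q up to exactly 1.0
        {
            fixed /= 2;
            ++exponent;
        }
        multiplier  = static_cast<int32_t>(fixed);
        right_shift = -exponent;
    }
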
+
+void CLQLSTMLayer::configure(const ICLTensor *input,
+ const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights,
+ const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights,
+ const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias,
+ const ICLTensor *cell_state_in, const ICLTensor *output_state_in,
+ ICLTensor *cell_state_out, ICLTensor *output_state_out, ICLTensor *output,
+ const LSTMParams<ICLTensor> &lstm_params)
+{
+ configure(CLKernelLibrary::get().get_compile_context(), input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
+ recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias,
+ cell_state_in, output_state_in, cell_state_out, output_state_out, output, lstm_params);
+}
+
+void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input,
+ const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights,
+ const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights,
+ const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias,
+ const ICLTensor *cell_state_in, const ICLTensor *output_state_in,
+ ICLTensor *cell_state_out, ICLTensor *output_state_out, ICLTensor *output,
+ const LSTMParams<ICLTensor> &lstm_params)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
+ recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
+ forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in,
+ cell_state_out, output_state_out, output);
+
+ // Set lstm parameters
+ LSTMParams<ITensorInfo> lstm_params_info{};
+ build_lstm_params_tensor_info(lstm_params, &lstm_params_info);
+
+ // Validate
+ ARM_COMPUTE_ERROR_THROW_ON(CLQLSTMLayer::validate(input->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), input_to_output_weights->info(),
+ recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(),
+ forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(),
+ cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info(), output->info(),
+ lstm_params_info));
+
+ const int batch_size = input->info()->dimension(1);
+ const int num_units = input_to_output_weights->info()->dimension(1);
+ const int output_size = output_state_out->info()->dimension(_out_state_output_size_dimension_idx);
+
+ const UniformQuantizationInfo qinput = input->info()->quantization_info().uniform();
+ const UniformQuantizationInfo qcell_state_in = cell_state_in->info()->quantization_info().uniform();
+ const UniformQuantizationInfo qoutput_state_in = output_state_in->info()->quantization_info().uniform();
+
+ _projection_bias = lstm_params.projection_bias();
+ _input_to_forget_weights = input_to_forget_weights;
+ _input_to_cell_weights = input_to_cell_weights;
+ _input_to_output_weights = input_to_output_weights;
+ _recurrent_to_forget_weights = recurrent_to_forget_weights;
+ _recurrent_to_cell_weights = recurrent_to_cell_weights;
+ _recurrent_to_output_weights = recurrent_to_output_weights;
+ _projection_weights = lstm_params.projection_weights();
+
+ // Layer normalization
+ _has_layer_norm = lstm_params.use_layer_norm();
+ if(_has_layer_norm)
+ {
+ set_layer_norm_weight(lstm_params.forget_layer_norm_weights(), LayerNormGate::Forget);
+ set_layer_norm_weight(lstm_params.cell_layer_norm_weights(), LayerNormGate::Cell);
+ set_layer_norm_weight(lstm_params.input_layer_norm_weights(), LayerNormGate::Input);
+ set_layer_norm_weight(lstm_params.output_layer_norm_weights(), LayerNormGate::Output);
+
+ set_layer_norm_bias(forget_gate_bias, LayerNormGate::Forget);
+ set_layer_norm_bias(cell_bias, LayerNormGate::Cell);
+ set_layer_norm_bias(lstm_params.input_gate_bias(), LayerNormGate::Input);
+ set_layer_norm_bias(output_gate_bias, LayerNormGate::Output);
+ }
+
+ _has_cifg = lstm_params.has_cifg_opt();
+ _has_projection = lstm_params.has_projection();
+ _has_peephole = lstm_params.has_peephole_opt();
+
+ // Calculate and decompose effective scales for optimizing matmul calculation
+ const int32_t cell_shift = log2(qcell_state_in.scale);
+
+ // Calculate quantized parameters for clipping.
+ int16_t quantized_cell_clip = 0;
+ if(lstm_params.cell_clip() > 0.0f)
+ {
+ quantized_cell_clip = quantize_qsymm16(lstm_params.cell_clip(), qcell_state_in);
+ }
+ _has_cell_clipping = quantized_cell_clip > 0;
+
+ // Precompute effective bias for optimizing the matmul computations.
+ if(!_has_cifg)
+ {
+ _input_to_input_weights = lstm_params.input_to_input_weights();
+ _recurrent_to_input_weights = lstm_params.recurrent_to_input_weights();
+
+ _input_to_input_reduction.configure(compile_context, _input_to_input_weights, &_input_to_input_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
+ _recurrent_to_input_reduction.configure(compile_context, _recurrent_to_input_weights, &_recurrent_to_input_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
+ }
+ _input_to_forget_reduction.configure(compile_context, input_to_forget_weights, &_input_to_forget_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
+ _recurrent_to_forget_reduction.configure(compile_context, recurrent_to_forget_weights, &_recurrent_to_forget_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
+ _input_to_cell_reduction.configure(compile_context, input_to_cell_weights, &_input_to_cell_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
+ _recurrent_to_cell_reduction.configure(compile_context, recurrent_to_cell_weights, &_recurrent_to_cell_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
+ _input_to_output_reduction.configure(compile_context, input_to_output_weights, &_input_to_output_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
+ _recurrent_to_output_reduction.configure(compile_context, recurrent_to_output_weights, &_recurrent_to_output_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
+ if(_has_projection)
+ {
+ _projection_reduction.configure(compile_context, _projection_weights, &_projection_eff_bias, GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true));
+ }
+
+ // Pre-transpose weights to be used in GEMM.
+ _transpose_input_to_forget_weights.configure(compile_context, input_to_forget_weights, &_input_to_forget_weights_transposed);
+ _transpose_input_to_cell_weights.configure(compile_context, input_to_cell_weights, &_input_to_cell_weights_transposed);
+ _transpose_input_to_output_weights.configure(compile_context, input_to_output_weights, &_input_to_output_weights_transposed);
+ _transpose_recurrent_to_forget_weights.configure(compile_context, recurrent_to_forget_weights, &_recurrent_to_forget_weights_transposed);
+ _transpose_recurrent_to_cell_weights.configure(compile_context, recurrent_to_cell_weights, &_recurrent_to_cell_weights_transposed);
+ _transpose_recurrent_to_output_weights.configure(compile_context, recurrent_to_output_weights, &_recurrent_to_output_weights_transposed);
+ if(!_has_cifg)
+ {
+ _transpose_input_to_input_weights.configure(compile_context, lstm_params.input_to_input_weights(), &_input_to_input_weights_transposed);
+ _transpose_recurrent_to_input_weights.configure(compile_context, lstm_params.recurrent_to_input_weights(), &_recurrent_to_input_weights_transposed);
+ }
+ if(_has_projection)
+ {
+ _transpose_projection_weights.configure(compile_context, _projection_weights, &_projection_weights_transposed);
+ }
+
+ GEMMLowpOutputStageInfo gemmlowp_info;
+ gemmlowp_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
+ gemmlowp_info.gemmlowp_min_bound = std::numeric_limits<int16_t>::lowest();
+ gemmlowp_info.gemmlowp_max_bound = std::numeric_limits<int16_t>::max();
+ gemmlowp_info.output_data_type = DataType::QSYMM16;
+
+ const TensorInfo mm_out_info(TensorShape(num_units, batch_size), 1, DataType::S32);
+ // Forget gate.
+ const TensorInfo forget_gate_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0));
+ const float input_to_forget_scale = input_to_forget_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.forget_intermediate_scale();
+ configure_mm(compile_context, _mm_input_to_forget, _input_to_forget_outstage, gemmlowp_info,
+ input, &_input_to_forget_weights_transposed, &_input_to_forget_eff_bias,
+ &_mm_input_to_forget_res, &_input_to_forget_outstage_res, input_to_forget_scale,
+ mm_out_info, forget_gate_outstage_info);
+
+ const float recurrent_to_forget_scale = recurrent_to_forget_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.forget_intermediate_scale();
+ configure_mm(compile_context, _mm_recurrent_to_forget, _recurrent_to_forget_outstage, gemmlowp_info,
+ output_state_in, &_recurrent_to_forget_weights_transposed, &_recurrent_to_forget_eff_bias,
+ &_mm_recurrent_to_forget_res, &_recurrent_to_forget_outstage_res, recurrent_to_forget_scale,
+ mm_out_info, forget_gate_outstage_info);
+
+ _accumulate_input_recurrent_forget.configure(compile_context, ArithmeticOperation::ADD, &_input_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, &_recurrent_to_forget_outstage_res,
+ ConvertPolicy::SATURATE);
+ _input_to_forget_outstage_res.allocator()->allocate();
+
+ if(_has_peephole)
+ {
+ _mul_cell_to_forget_res.allocator()->init(TensorInfo(cell_state_in->info()->tensor_shape(), 1, DataType::S32));
+ _memory_group.manage(&_mul_cell_to_forget_res);
+ _pixelwise_mul_cell_to_forget.configure(compile_context, cell_state_in, lstm_params.cell_to_forget_weights(), &_mul_cell_to_forget_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _cell_to_forget_outstage_res.allocator()->init(TensorInfo(_mul_cell_to_forget_res.info()->tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0)));
+ _memory_group.manage(&_cell_to_forget_outstage_res);
+ const float cell_to_forget_scale = std::pow(2, cell_shift) * lstm_params.cell_to_forget_weights()->info()->quantization_info().uniform().scale / lstm_params.forget_intermediate_scale();
+ quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift);
+ _cell_to_forget_outstage.configure(compile_context, &_mul_cell_to_forget_res, nullptr, &_cell_to_forget_outstage_res, gemmlowp_info);
+ _mul_cell_to_forget_res.allocator()->allocate();
+ _accumulate_cell_forget.configure(compile_context, ArithmeticOperation::ADD, &_recurrent_to_forget_outstage_res, &_cell_to_forget_outstage_res, &_recurrent_to_forget_outstage_res,
+ ConvertPolicy::SATURATE);
+ _cell_to_forget_outstage_res.allocator()->allocate();
+ }
+
+ CLTensor *forget_activation_input = &_recurrent_to_forget_outstage_res;
+
+ if(_has_layer_norm)
+ {
+ configure_layer_norm(LayerNormGate::Forget, &_recurrent_to_forget_outstage_res);
+ _recurrent_to_forget_outstage_res.allocator()->allocate();
+ forget_activation_input = &get_layer_norm_output(LayerNormGate::Forget);
+ }
+
+ // Output quantization info of Sigmoid and Tanh activations
+ const QuantizationInfo sigmoid_tanh_outqinfo(1.f / 32768.f, 0);
+
+ const TensorInfo forget_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
+ _memory_group.manage(&_forget_gate);
+ _forget_gate.allocator()->init(forget_gate_info);
+ _forget_gate_sigmoid.configure(compile_context, forget_activation_input, &_forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+ forget_activation_input->allocator()->allocate();
+
+ // Modulation gate.
+ const TensorInfo cell_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.cell_intermediate_scale(), 0));
+ const float input_to_cell_scale = input_to_cell_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.cell_intermediate_scale();
+ configure_mm(compile_context, _mm_input_to_cell, _input_to_cell_outstage, gemmlowp_info,
+ input, &_input_to_cell_weights_transposed, &_input_to_cell_eff_bias,
+ &_mm_input_to_cell_res, &_input_to_cell_outstage_res, input_to_cell_scale,
+ mm_out_info, cell_outstage_info);
+
+ const float recurrent_to_cell_scale = recurrent_to_cell_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.cell_intermediate_scale();
+ configure_mm(compile_context, _mm_recurrent_to_cell, _recurrent_to_cell_outstage, gemmlowp_info,
+ output_state_in, &_recurrent_to_cell_weights_transposed, &_recurrent_to_cell_eff_bias,
+ &_mm_recurrent_to_cell_res, &_recurrent_to_cell_outstage_res, recurrent_to_cell_scale,
+ mm_out_info, cell_outstage_info);
+
+ _accumulate_input_recurrent_modulation.configure(compile_context, ArithmeticOperation::ADD, &_input_to_cell_outstage_res, &_recurrent_to_cell_outstage_res, &_recurrent_to_cell_outstage_res,
+ ConvertPolicy::SATURATE);
+ _input_to_cell_outstage_res.allocator()->allocate();
+
+ CLTensor *cell_activation_input = &_recurrent_to_cell_outstage_res;
+
+ if(_has_layer_norm)
+ {
+ configure_layer_norm(LayerNormGate::Cell, &_recurrent_to_cell_outstage_res);
+ _recurrent_to_cell_outstage_res.allocator()->allocate();
+ cell_activation_input = &get_layer_norm_output(LayerNormGate::Cell);
+ }
+
+ const TensorInfo cell_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
+ _memory_group.manage(&_cell_gate);
+ _cell_gate.allocator()->init(cell_gate_info);
+ _cell_gate_tanh.configure(compile_context, cell_activation_input, &_cell_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f));
+ cell_activation_input->allocator()->allocate();
+
+ // Input gate.
+ const TensorInfo input_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
+ _input_gate.allocator()->init(input_gate_info);
+ _memory_group.manage(&_input_gate);
+ if(_has_cifg)
+ {
+ _ones.allocator()->init(*_forget_gate.info());
+ _input_gate_sub.configure(compile_context, ArithmeticOperation::SUB, &_ones, &_forget_gate, &_input_gate, ConvertPolicy::SATURATE);
+ _ones.allocator()->allocate();
+ }
+ else
+ {
+ const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0));
+ const float input_to_input_scale = _input_to_input_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.input_intermediate_scale();
+ configure_mm(compile_context, _mm_input_to_input, _input_to_input_outstage, gemmlowp_info,
+ input, &_input_to_input_weights_transposed, &_input_to_input_eff_bias,
+ &_mm_input_to_input_res, &_input_to_input_outstage_res, input_to_input_scale,
+ mm_out_info, input_outstage_info);
+
+ const float recurrent_to_input_scale = _recurrent_to_input_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.input_intermediate_scale();
+ configure_mm(compile_context, _mm_recurrent_to_input, _recurrent_to_input_outstage, gemmlowp_info,
+ output_state_in, &_recurrent_to_input_weights_transposed, &_recurrent_to_input_eff_bias,
+ &_mm_recurrent_to_input_res, &_recurrent_to_input_outstage_res, recurrent_to_input_scale,
+ mm_out_info, input_outstage_info);
+ _accumulate_input_recurrent_input.configure(compile_context, ArithmeticOperation::ADD, &_input_to_input_outstage_res, &_recurrent_to_input_outstage_res, &_recurrent_to_input_outstage_res,
+ ConvertPolicy::SATURATE);
+ _input_to_input_outstage_res.allocator()->allocate();
+
+ if(_has_peephole)
+ {
+ _mul_cell_to_input_res.allocator()->init(TensorInfo(cell_state_in->info()->tensor_shape(), 1, DataType::S32));
+ _memory_group.manage(&_mul_cell_to_input_res);
+ _pixelwise_mul_cell_to_input.configure(compile_context, cell_state_in, lstm_params.cell_to_input_weights(), &_mul_cell_to_input_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ const float cell_to_input_scale = std::pow(2, cell_shift) * lstm_params.cell_to_input_weights()->info()->quantization_info().uniform().scale / lstm_params.input_intermediate_scale();
+ quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift);
+ _cell_to_input_outstage_res.allocator()->init(TensorInfo(_mul_cell_to_input_res.info()->tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0)));
+ _memory_group.manage(&_cell_to_input_outstage_res);
+ _cell_to_input_outstage.configure(compile_context, &_mul_cell_to_input_res, nullptr, &_cell_to_input_outstage_res, gemmlowp_info);
+ _mul_cell_to_input_res.allocator()->allocate();
+ _accumulate_cell_input.configure(compile_context, ArithmeticOperation::ADD, &_recurrent_to_input_outstage_res, &_cell_to_input_outstage_res, &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE);
+ _cell_to_input_outstage_res.allocator()->allocate();
+ }
+
+ CLTensor *input_activation_input = &_recurrent_to_input_outstage_res;
+
+ if(_has_layer_norm)
+ {
+ configure_layer_norm(LayerNormGate::Input, &_recurrent_to_input_outstage_res);
+ _recurrent_to_input_outstage_res.allocator()->allocate();
+ input_activation_input = &get_layer_norm_output(LayerNormGate::Input);
+ }
+
+ _input_gate_sigmoid.configure(compile_context, input_activation_input, &_input_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+ input_activation_input->allocator()->allocate();
+ }
+ // Cell.
+ // TODO(COMPMID-3396): Perform multiplication in the quantized domain in CLPixelWiseMultiplicationKernel
+ _pixelwise_mul_forget_cell.configure(compile_context, &_forget_gate, cell_state_in, &_forget_gate, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ const float cell_gate_scale = _cell_gate.info()->quantization_info().uniform().scale;
+ const float mul_input_cell_scale = cell_gate_scale * std::pow(2, 15 + cell_shift);
+ const TensorInfo mul_input_cell_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(mul_input_cell_scale, 0));
+ _memory_group.manage(&_mul_input_cell_res);
+ _mul_input_cell_res.allocator()->init(mul_input_cell_info);
+ _pixelwise_mul_input_cell.configure(compile_context, &_input_gate, &_cell_gate, &_mul_input_cell_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _cell_gate.allocator()->allocate();
+ _add_forget_cell.configure(compile_context, ArithmeticOperation::ADD, &_forget_gate, &_mul_input_cell_res, cell_state_out, ConvertPolicy::SATURATE);
+ _mul_input_cell_res.allocator()->allocate();
+ _forget_gate.allocator()->allocate();
+ if(_has_cell_clipping)
+ {
+ _cell_clip.configure(compile_context, cell_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_cell_clip, quantized_cell_clip));
+ }
+ // Output gate.
+ const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.output_intermediate_scale(), 0));
+ const float input_to_output_scale = input_to_output_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.output_intermediate_scale();
+ configure_mm(compile_context, _mm_input_to_output, _input_to_output_outstage, gemmlowp_info,
+ input, &_input_to_output_weights_transposed, &_input_to_output_eff_bias,
+ &_mm_input_to_output_res, &_input_to_output_outstage_res, input_to_output_scale,
+ mm_out_info, output_outstage_info);
+
+ const float recurrent_to_output_scale = recurrent_to_output_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.output_intermediate_scale();
+ configure_mm(compile_context, _mm_recurrent_to_output, _recurrent_to_output_outstage, gemmlowp_info,
+ output_state_in, &_recurrent_to_output_weights_transposed, &_recurrent_to_output_eff_bias,
+ &_mm_recurrent_to_output_res, &_recurrent_to_output_outstage_res, recurrent_to_output_scale,
+ mm_out_info, output_outstage_info);
+
+ _accumulate_input_recurrent_output.configure(compile_context, ArithmeticOperation::ADD, &_recurrent_to_output_outstage_res, &_input_to_output_outstage_res, &_recurrent_to_output_outstage_res,
+ ConvertPolicy::SATURATE);
+ _input_to_output_outstage_res.allocator()->allocate();
+
+ if(_has_peephole)
+ {
+ // TODO(COMPMID-3396): Perform multiplication in the quantized domain in CLPixelWiseMultiplicationKernel
+ // The multiplication runs in S32; the result is rescaled by the output stage configured below
+ _mul_cell_to_output_res.allocator()->init(TensorInfo(cell_state_out->info()->tensor_shape(), 1, DataType::S32));
+ _memory_group.manage(&_mul_cell_to_output_res);
+ _pixelwise_mul_cell_to_output.configure(compile_context, cell_state_out, lstm_params.cell_to_output_weights(), &_mul_cell_to_output_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+
+ const float cell_to_output_scale = std::pow(2, cell_shift) * lstm_params.cell_to_output_weights()->info()->quantization_info().uniform().scale / lstm_params.output_intermediate_scale();
+ quantization::calculate_quantized_multiplier(cell_to_output_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift);
+ _cell_to_output_outstage_res.allocator()->init(TensorInfo(_mul_cell_to_output_res.info()->tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.output_intermediate_scale(), 0)));
+ _memory_group.manage(&_cell_to_output_outstage_res);
+ _cell_to_output_outstage.configure(compile_context, &_mul_cell_to_output_res, nullptr, &_cell_to_output_outstage_res, gemmlowp_info);
+ _mul_cell_to_output_res.allocator()->allocate();
+
+ _accumulate_cell_to_output.configure(compile_context, ArithmeticOperation::ADD, &_recurrent_to_output_outstage_res, &_cell_to_output_outstage_res, &_recurrent_to_output_outstage_res,
+ ConvertPolicy::SATURATE);
+ _cell_to_output_outstage_res.allocator()->allocate();
+ }
+
+ CLTensor *output_activation_input = &_recurrent_to_output_outstage_res;
+
+ if(_has_layer_norm)
+ {
+ configure_layer_norm(LayerNormGate::Output, &_recurrent_to_output_outstage_res);
+ _recurrent_to_output_outstage_res.allocator()->allocate();
+ output_activation_input = &get_layer_norm_output(LayerNormGate::Output);
+ }
+
+ const TensorInfo output_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
+ _memory_group.manage(&_output_gate);
+ _output_gate.allocator()->init(output_gate_info);
+ _output_gate_sigmoid.configure(compile_context, output_activation_input, &_output_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+ output_activation_input->allocator()->allocate();
+
+ // Hidden.
+ _hidden_tanh.configure(compile_context, cell_state_out, &_input_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f));
+ // TODO(COMPMID-3396): Perform multiplication in the quantized domain in CLPixelWiseMultiplicationKernel
+ _memory_group.manage(&_hidden_mul_res);
+ const TensorInfo hidden_mul_res(_input_gate.info()->tensor_shape(), 1, DataType::S32);
+ _hidden_mul_res.allocator()->init(hidden_mul_res);
+ _pixelwise_mul_hidden.configure(compile_context, &_output_gate, &_input_gate, &_hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _output_gate.allocator()->allocate();
+ _input_gate.allocator()->allocate();
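+ // Both factors of the hidden product are Q0.15, contributing 2^-30 in total; dividing by the requested hidden state scale yields the effective requantization scale.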
+ const float hidden_state_scale = std::pow(2, -15) / lstm_params.hidden_state_scale() * std::pow(2, -15);
+ quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true);
+ gemmlowp_info.gemmlowp_offset = lstm_params.hidden_state_zero();
+ gemmlowp_info.output_data_type = output_state_in->info()->data_type();
+
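+ // When num_units differs from output_size the hidden gate cannot be written straight into output_state_out, so an intermediate tensor is used and copied around the projection.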
+ _projection_tensor_copy_required = (num_units != output_size);
+ ICLTensor *hidden_gate_result = output_state_out;
+
+ _memory_group.manage(&_hidden_gate);
+
+ if(_projection_tensor_copy_required)
+ {
+ _hidden_gate.allocator()->init(*output_state_out->info());
+ _hidden_gate.info()->set_tensor_shape(_hidden_mul_res.info()->tensor_shape());
+ hidden_gate_result = &_hidden_gate;
+ }
+
+ _hidden_outstage.configure(compile_context, &_hidden_mul_res, nullptr, hidden_gate_result, gemmlowp_info);
+ _hidden_mul_res.allocator()->allocate();
+
+ // Projection.
+ if(_has_projection)
+ {
+ const TensorInfo projection_outstage_info(*output_state_out->info());
+ const UniformQuantizationInfo qprojection = _projection_weights->info()->quantization_info().uniform();
+ const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale;
+ gemmlowp_info.gemmlowp_offset = qoutput_state_in.offset;
+ gemmlowp_info.gemmlowp_min_bound = std::numeric_limits<int8_t>::lowest();
+ gemmlowp_info.gemmlowp_max_bound = std::numeric_limits<int8_t>::max();
+ gemmlowp_info.output_data_type = DataType::QASYMM8_SIGNED;
+
+ TensorInfo projection_mm_out_info{ mm_out_info };
+ projection_mm_out_info.set_tensor_shape(TensorShape(output_size, batch_size));
+
+ configure_mm(compile_context, _mm_projection, _projection_outstage, gemmlowp_info,
+ hidden_gate_result, &_projection_weights_transposed, &_projection_eff_bias,
+ &_mm_projection_res, &_projection_outstage_res, projection_scale,
+ projection_mm_out_info, projection_outstage_info);
+
+ ICLTensor *accumulate_destination = output_state_out;
+
+ if(_projection_tensor_copy_required)
+ {
+ _hidden_gate.allocator()->allocate();
+ _projection_accumulate_res.allocator()->init(*output_state_out->info());
+ _projection_accumulate_res.info()->set_tensor_shape(_projection_outstage_res.info()->tensor_shape());
+ _projection_output_to_accumulate_copy.configure(*output_state_out, _projection_accumulate_res);
+ accumulate_destination = &_projection_accumulate_res;
+ }
+
+ _accumulate_projection.configure(compile_context, ArithmeticOperation::ADD, &_projection_outstage_res, accumulate_destination, accumulate_destination, ConvertPolicy::SATURATE);
+ _projection_outstage_res.allocator()->allocate();
+
+ if(_projection_tensor_copy_required)
+ {
+ _projection_accumulate_to_output_copy.configure(_projection_accumulate_res, *output_state_out);
+ _projection_accumulate_res.allocator()->allocate();
+ }
+
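+ // Quantize the clip threshold into the QASYMM8_SIGNED domain of the projected output; clipping is skipped when the quantized threshold is zero.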
+ int8_t quantized_projection_clip{ 0 };
+ if(lstm_params.projection_clip() > 0.0f)
+ {
+ quantized_projection_clip = utility::clamp<int8_t>(lstm_params.projection_clip() / qprojection.scale, -128, 127);
+ }
+
+ if(quantized_projection_clip > 0)
+ {
+ _projection_clip.configure(compile_context, output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_projection_clip,
+ quantized_projection_clip));
+ _has_projection_clipping = true;
+ }
+ }
+ else
+ {
+ if(_projection_tensor_copy_required)
+ {
+ _hidden_to_output_copy.configure(_hidden_gate, *output_state_out);
+ _hidden_gate.allocator()->allocate();
+ }
+ }
+
+ // Copy output_state_out to output
+ _copy_output.configure(compile_context, output_state_out, output);
+}
+
+Status CLQLSTMLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights,
+ const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights,
+ const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias,
+ const ITensorInfo *cell_state_in, const ITensorInfo *output_state_in,
+ const ITensorInfo *cell_state_out, const ITensorInfo *output_state_out, const ITensorInfo *output,
+ const LSTMParams<ITensorInfo> &lstm_params)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights,
+ recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in,
+ cell_state_out, output_state_out, output);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() != 2, "Input must have exactly 2 dimensions");
+
+ const unsigned int input_size = input->dimension(0);
+ const unsigned int batch_size = input->dimension(1);
+ const unsigned int num_units = input_to_output_weights->dimension(1);
+ const unsigned int output_size = output_state_out->dimension(_out_state_output_size_dimension_idx);
+
+ ARM_COMPUTE_RETURN_ERROR_ON(input_to_output_weights->num_dimensions() != 2);
+ ARM_COMPUTE_RETURN_ERROR_ON(input_to_output_weights->dimension(0) != input_size);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_to_output_weights, input_to_forget_weights, input_to_cell_weights);
+ ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_output_weights->num_dimensions() != 2);
+ ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_output_weights->dimension(1) != num_units);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_to_forget_weights, 1, DataType::QSYMM8);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
+ recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights);
+
+ ARM_COMPUTE_RETURN_ERROR_ON(forget_gate_bias->num_dimensions() != 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(forget_gate_bias->dimension(0) != num_units);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(forget_gate_bias, cell_bias, output_gate_bias);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(forget_gate_bias, 1, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(forget_gate_bias, cell_bias, output_gate_bias);
+
+ ARM_COMPUTE_RETURN_ERROR_ON(cell_state_in->num_dimensions() != 2);
+ ARM_COMPUTE_RETURN_ERROR_ON(cell_state_in->dimension(0) != num_units);
+ ARM_COMPUTE_RETURN_ERROR_ON(cell_state_in->dimension(1) != batch_size);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(cell_state_in, 1, DataType::QSYMM16);
+
+ ARM_COMPUTE_RETURN_ERROR_ON(output_state_in->num_dimensions() != 2);
+ ARM_COMPUTE_RETURN_ERROR_ON(output_state_in->dimension(0) != output_size);
+ ARM_COMPUTE_RETURN_ERROR_ON(output_state_in->dimension(1) != batch_size);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output_state_in);
+
+ // Check that the peephole weights are either all present or all absent
+ if(lstm_params.has_peephole_opt())
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_output_weights());
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1, DataType::QSYMM16);
+ ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->num_dimensions() != 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->dimension(0) != num_units);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_output_weights());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_output_weights());
+
+ if(!lstm_params.has_cifg_opt())
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_input_weights());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_input_weights());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_input_weights());
+ }
+ }
+
+ const UniformQuantizationInfo qinput = input->quantization_info().uniform();
+ const UniformQuantizationInfo qcell_state_in = cell_state_in->quantization_info().uniform();
+ const UniformQuantizationInfo qoutput_state_in = output_state_in->quantization_info().uniform();
+
+ // Calculate and decompose the effective scales to optimize the matmul computations
+ const int32_t cell_shift = log2(qcell_state_in.scale);
+ ARM_COMPUTE_RETURN_ERROR_ON(cell_shift > -9);
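+ // Only power-of-two cell state scales no greater than 2^-9 are supported; log2 of the scale recovers the shift used by the peephole rescaling.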
+
+ // Calculate quantized parameters for clipping.
+ int16_t quantized_cell_clip = 0;
+ if(lstm_params.cell_clip() > 0.0f)
+ {
+ quantized_cell_clip = quantize_qsymm16(lstm_params.cell_clip(), qcell_state_in);
+ }
+
+ // Precompute effective bias for optimizing the matmul computations.
+ const TensorInfo eff_bias_info(TensorShape(num_units), 1, DataType::S32);
+ const TensorInfo projection_eff_bias_info(TensorShape(output_size), 1, DataType::S32);
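+ // Each matrix-A reduction folds the zero-point correction (the negated offset times the weight column sums) into an additive S32 bias, so the GEMMs run without per-element offset handling.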
+ if(!lstm_params.has_cifg_opt())
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(lstm_params.input_to_input_weights(), &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(lstm_params.recurrent_to_input_weights(), &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset,
+ true)));
+ }
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(input_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(recurrent_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(input_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(recurrent_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(input_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(recurrent_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
+ if(lstm_params.has_projection())
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(lstm_params.projection_weights(), &projection_eff_bias_info, GEMMLowpReductionKernelInfo(output_size, false,
+ lstm_params.hidden_state_zero(),
+ true)));
+ }
+
+ const TensorInfo input_weights_transposed(TensorShape(num_units, input_size), 1, input_to_forget_weights->data_type(), input_to_forget_weights->quantization_info());
+ const TensorInfo recurrent_weights_transposed(TensorShape(num_units, output_size), 1, recurrent_to_forget_weights->data_type(), recurrent_to_forget_weights->quantization_info());
+
+ // Validate weights transpose
+ ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(input_to_forget_weights, &input_weights_transposed));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(input_to_cell_weights, &input_weights_transposed));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(input_to_output_weights, &input_weights_transposed));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(recurrent_to_forget_weights, &recurrent_weights_transposed));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(recurrent_to_cell_weights, &recurrent_weights_transposed));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(recurrent_to_output_weights, &recurrent_weights_transposed));
+ if(!lstm_params.has_cifg_opt())
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(lstm_params.input_to_input_weights(), &input_weights_transposed));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(lstm_params.recurrent_to_input_weights(), &recurrent_weights_transposed));
+ }
+ if(lstm_params.has_projection())
+ {
+ const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1, lstm_params.projection_weights()->data_type(), lstm_params.projection_weights()->quantization_info());
+ ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(lstm_params.projection_weights(), &projection_weights_transposed));
+ }
+
+ GEMMLowpOutputStageInfo gemmlowp_info;
+ gemmlowp_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
+ gemmlowp_info.gemmlowp_min_bound = std::numeric_limits<int16_t>::lowest();
+ gemmlowp_info.gemmlowp_max_bound = std::numeric_limits<int16_t>::max();
+ gemmlowp_info.output_data_type = DataType::QSYMM16;
+
+ const bool has_layer_norm = lstm_params.use_layer_norm();
+
+ // Forget gate.
+ const TensorInfo forget_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0));
+ const TensorInfo mm_out_info(TensorShape(num_units, batch_size), 1, DataType::S32);
+ const float input_to_forget_scale = input_to_forget_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.forget_intermediate_scale();
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_forget_scale, &mm_out_info, &forget_outstage_info));
+
+ const float recurrent_to_forget_scale = recurrent_to_forget_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.forget_intermediate_scale();
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_forget_scale, &mm_out_info, &forget_outstage_info));
+
+ ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, &forget_outstage_info, &forget_outstage_info, &forget_outstage_info, ConvertPolicy::SATURATE));
+
+ if(lstm_params.has_peephole_opt())
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1, DataType::QSYMM16);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &mm_out_info, 1.f, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO));
+ const float cell_to_forget_scale = std::pow(2, cell_shift) * lstm_params.cell_to_forget_weights()->quantization_info().uniform().scale / lstm_params.forget_intermediate_scale();
+ ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(&mm_out_info, nullptr, &forget_outstage_info, gemmlowp_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, &forget_outstage_info, &forget_outstage_info, &forget_outstage_info, ConvertPolicy::SATURATE));
+ }
+
+ if(has_layer_norm)
+ {
+ const ITensorInfo *w_info = lstm_params.forget_layer_norm_weights();
+ const ITensorInfo *b_info = forget_gate_bias;
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_layer_norm(forget_outstage_info, *w_info, *b_info));
+ }
+
+ // Output quantization info of Sigmoid and Tanh activations
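+ // A scale of 1/32768 maps the QSYMM16 range [-32768, 32767] onto [-1, 1), i.e. Q0.15 fixed point.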
+ const QuantizationInfo sigmoid_tanh_outqinfo(1.f / 32768.f, 0);
+
+ const TensorInfo forget_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&forget_outstage_info, &forget_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+
+ // Modulation gate.
+ const TensorInfo cell_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.cell_intermediate_scale(), 0));
+ const float input_to_cell_scale = input_to_cell_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.cell_intermediate_scale();
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_cell_scale, &mm_out_info, &cell_outstage_info));
+
+ const float recurrent_to_cell_scale = recurrent_to_cell_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.cell_intermediate_scale();
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_cell_scale, &mm_out_info, &cell_outstage_info));
+
+ ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, &cell_outstage_info, &cell_outstage_info, &cell_outstage_info, ConvertPolicy::SATURATE));
+
+ if(has_layer_norm)
+ {
+ const ITensorInfo *w_info = lstm_params.cell_layer_norm_weights();
+ const ITensorInfo *b_info = cell_bias;
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_layer_norm(cell_outstage_info, *w_info, *b_info));
+ }
+
+ const TensorInfo cell_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&cell_outstage_info, &cell_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)));
+
+ // Input gate.
+ const TensorInfo input_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
+ if(lstm_params.has_cifg_opt())
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(lstm_params.input_gate_bias() != nullptr, "Input gate bias must not be present when CIFG is used");
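+ // With CIFG the input gate is reconstructed as (1 - forget gate); the infos passed below stand in for the ones tensor and the gates, which share the same shape and Q0.15 quantization.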
+ ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::SUB, &input_gate_info, &forget_gate_info, &forget_gate_info, ConvertPolicy::SATURATE));
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights(), lstm_params.input_gate_bias());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_to_forget_weights, lstm_params.input_to_input_weights());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_forget_weights, lstm_params.recurrent_to_input_weights());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(forget_gate_bias, lstm_params.input_gate_bias());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(forget_gate_bias, lstm_params.input_gate_bias());
+
+ const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0));
+ const float input_to_input_scale = lstm_params.input_to_input_weights()->quantization_info().uniform().scale * qinput.scale / lstm_params.input_intermediate_scale();
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_input_scale, &mm_out_info, &input_outstage_info));
+
+ const float recurrent_to_input_scale = lstm_params.recurrent_to_input_weights()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.input_intermediate_scale();
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_input_scale, &mm_out_info, &input_outstage_info));
+
+ ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, &input_outstage_info, &input_outstage_info, &input_outstage_info, ConvertPolicy::SATURATE));
+
+ if(lstm_params.has_peephole_opt())
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(cell_state_in, lstm_params.cell_to_input_weights(), &mm_out_info, 1.f, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO));
+ const float cell_to_input_scale = std::pow(2, cell_shift) * lstm_params.cell_to_input_weights()->quantization_info().uniform().scale / lstm_params.input_intermediate_scale();
+ ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(&mm_out_info, &eff_bias_info, &input_outstage_info, gemmlowp_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, &input_outstage_info, &input_outstage_info, &input_outstage_info, ConvertPolicy::SATURATE));
+ }
+
+ if(has_layer_norm)
+ {
+ const ITensorInfo *w_info = lstm_params.input_layer_norm_weights();
+ const ITensorInfo *b_info = lstm_params.input_gate_bias();
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_layer_norm(input_outstage_info, *w_info, *b_info));
+ }
+
+ ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&input_outstage_info, &input_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC, 1.f, 1.f)));
+ }
+ // Cell.
+ ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(&forget_gate_info, cell_state_in, &forget_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(&input_gate_info, &cell_gate_info, &cell_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, &forget_gate_info, &cell_gate_info, cell_state_out, ConvertPolicy::SATURATE));
+ if(quantized_cell_clip > 0)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(cell_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_cell_clip,
+ quantized_cell_clip)));
+ }
+ // Output gate.
+ const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.output_intermediate_scale(), 0));
+ const float input_to_output_scale = input_to_output_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.output_intermediate_scale();
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_output_scale, &mm_out_info, &output_outstage_info));
+
+ const float recurrent_to_output_scale = recurrent_to_output_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.output_intermediate_scale();
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_output_scale, &mm_out_info, &output_outstage_info));
+
+ ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, &output_outstage_info, &output_outstage_info, &output_outstage_info, ConvertPolicy::SATURATE));
+ if(lstm_params.has_peephole_opt())
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_output_weights(), 1, DataType::QSYMM16);
+ // TODO(COMPMID-3395): Perform multiplication in the quantized domain in CLPixelWiseMultiplicationKernel
+ // Here we are not using the output stage because all operations are done in float
+ // const float cell_to_output_scale = std::pow(2, cell_shift) * lstm_params.cell_to_output_weights()->quantization_info().uniform().scale / lstm_params.output_intermediate_scale();
+ // ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_output_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(cell_state_out, lstm_params.cell_to_output_weights(), &output_outstage_info, 1.f, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, &output_outstage_info, &output_outstage_info, &output_outstage_info, ConvertPolicy::SATURATE));
+ }
+
+ if(has_layer_norm)
+ {
+ const ITensorInfo *w_info = lstm_params.output_layer_norm_weights();
+ const ITensorInfo *b_info = output_gate_bias;
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_layer_norm(output_outstage_info, *w_info, *b_info));
+ }
+
+ const TensorInfo output_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&output_outstage_info, &output_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+
+ // Hidden.
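+ // As in configure(), input_gate_info stands in for the output of the cell state tanh; shape and Q0.15 quantization are identical.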
+ ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(cell_state_out, &input_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)));
+ const TensorInfo hidden_mul_res(TensorShape(num_units, batch_size), 1, DataType::S32);
+ const TensorInfo hidden_out_info(TensorShape(num_units, batch_size), 1, DataType::QASYMM8_SIGNED);
+
+ ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(&output_gate_info, &input_gate_info, &hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
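+ // Same derivation as in configure(): two Q0.15 factors contribute 2^-30, divided by the requested hidden state scale.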
+ const float hidden_state_scale = std::pow(2, -15) / lstm_params.hidden_state_scale() * std::pow(2, -15);
+ ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true));
+ gemmlowp_info.gemmlowp_offset = lstm_params.hidden_state_zero();
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(&hidden_mul_res, nullptr, &hidden_out_info, gemmlowp_info));
+
+ const bool projection_tensor_copy_required = num_units != output_size;
+
+ // Projection.
+ if(lstm_params.has_projection())
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(recurrent_to_forget_weights, lstm_params.projection_weights());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(forget_gate_bias, lstm_params.projection_bias());
+
+ const UniformQuantizationInfo qprojection = lstm_params.projection_weights()->quantization_info().uniform();
+ const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale;
+ ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(projection_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
+ gemmlowp_info.gemmlowp_offset = qoutput_state_in.offset;
+ gemmlowp_info.gemmlowp_min_bound = std::numeric_limits<int8_t>::lowest();
+ gemmlowp_info.gemmlowp_max_bound = std::numeric_limits<int8_t>::max();
+ gemmlowp_info.output_data_type = DataType::QASYMM8_SIGNED;
+
+ const TensorInfo projection_outstage_info(*output_state_out);
+ const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1, lstm_params.projection_weights()->data_type(), lstm_params.projection_weights()->quantization_info());
+
+ TensorInfo projection_mm_out_info{ mm_out_info };
+ projection_mm_out_info.set_tensor_shape(TensorShape(output_size, batch_size));
+
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, &hidden_out_info, &projection_weights_transposed, &projection_eff_bias_info, projection_scale, &projection_mm_out_info,
+ &projection_outstage_info));
+
+ if(projection_tensor_copy_required)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLQLSTMLayer::TensorCopyKernel::validate(*output_state_out, projection_outstage_info));
+ }
+
+ ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, output_state_out, output_state_out, output_state_out, ConvertPolicy::SATURATE));
+
+ if(projection_tensor_copy_required)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLQLSTMLayer::TensorCopyKernel::validate(projection_outstage_info, *output_state_out));
+ }
+
+ int8_t quantized_projection_clip{ 0 };
+ if(lstm_params.projection_clip() > 0.0f)
+ {
+ quantized_projection_clip = quantize_qasymm8_signed(lstm_params.projection_clip(), qprojection);
+ }
+
+ if(quantized_projection_clip > 0)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_projection_clip,
+ quantized_projection_clip)));
+ }
+ }
+ else
+ {
+ if(projection_tensor_copy_required)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLQLSTMLayer::TensorCopyKernel::validate(hidden_out_info, *output_state_out));
+ }
+ }
+
+ if(cell_state_out->total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(cell_state_in, cell_state_out);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(cell_state_in, cell_state_out);
+ }
+
+ if(output_state_out->total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output_state_out);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output_state_in, output_state_out);
+ }
+
+ ARM_COMPUTE_RETURN_ON_ERROR(CLCopyKernel::validate(output_state_out, output));
+ return Status{};
+}
+
+void CLQLSTMLayer::run()
+{
+ prepare();
+
+ // Acquire all the temporaries
+ MemoryGroupResourceScope scope_mg(_memory_group);
+
+ // Forget gate.
+ _mm_input_to_forget.run();
+ _input_to_forget_outstage.run();
+
+ _mm_recurrent_to_forget.run();
+ _recurrent_to_forget_outstage.run();
+ CLScheduler::get().enqueue(_accumulate_input_recurrent_forget);
+
+ if(_has_peephole)
+ {
+ CLScheduler::get().enqueue(_pixelwise_mul_cell_to_forget);
+ _cell_to_forget_outstage.run();
+ CLScheduler::get().enqueue(_accumulate_cell_forget);
+ }
+
+ if(_has_layer_norm)
+ {
+ CLScheduler::get().enqueue(get_layer_norm(LayerNormGate::Forget));
+ }
+
+ _forget_gate_sigmoid.run();
+
+ // Modulation gate.
+ _mm_input_to_cell.run();
+ _input_to_cell_outstage.run();
+
+ _mm_recurrent_to_cell.run();
+ _recurrent_to_cell_outstage.run();
+ CLScheduler::get().enqueue(_accumulate_input_recurrent_modulation);
+
+ if(_has_layer_norm)
+ {
+ CLScheduler::get().enqueue(get_layer_norm(LayerNormGate::Cell));
+ }
+
+ _cell_gate_tanh.run();
+
+ // Input gate.
+ if(_has_cifg)
+ {
+ CLScheduler::get().enqueue(_input_gate_sub);
+ }
+ else
+ {
+ _mm_input_to_input.run();
+ _input_to_input_outstage.run();
+ _mm_recurrent_to_input.run();
+ _recurrent_to_input_outstage.run();
+ CLScheduler::get().enqueue(_accumulate_input_recurrent_input);
+
+ if(_has_peephole)
+ {
+ CLScheduler::get().enqueue(_pixelwise_mul_cell_to_input);
+ _cell_to_input_outstage.run();
+ CLScheduler::get().enqueue(_accumulate_cell_input);
+ }
+
+ if(_has_layer_norm)
+ {
+ CLScheduler::get().enqueue(get_layer_norm(LayerNormGate::Input));
+ }
+
+ _input_gate_sigmoid.run();
+ }
+
+ // Cell.
+ CLScheduler::get().enqueue(_pixelwise_mul_forget_cell);
+ CLScheduler::get().enqueue(_pixelwise_mul_input_cell);
+ CLScheduler::get().enqueue(_add_forget_cell);
+ if(_has_cell_clipping)
+ {
+ _cell_clip.run();
+ }
+
+ // Output gate.
+ _mm_input_to_output.run();
+ _input_to_output_outstage.run();
+ _mm_recurrent_to_output.run();
+ _recurrent_to_output_outstage.run();
+ CLScheduler::get().enqueue(_accumulate_input_recurrent_output);
+ if(_has_peephole)
+ {
+ CLScheduler::get().enqueue(_pixelwise_mul_cell_to_output);
+ _cell_to_output_outstage.run();
+ CLScheduler::get().enqueue(_accumulate_cell_to_output);
+ }
+
+ if(_has_layer_norm)
+ {
+ CLScheduler::get().enqueue(get_layer_norm(LayerNormGate::Output));
+ }
+
+ _output_gate_sigmoid.run();
+
+ // Hidden.
+ _hidden_tanh.run();
+ CLScheduler::get().enqueue(_pixelwise_mul_hidden);
+ _hidden_outstage.run();
+
+ // Projection.
+ if(_has_projection)
+ {
+ _mm_projection.run();
+ _projection_outstage.run();
+
+ if(_projection_tensor_copy_required)
+ {
+ _projection_output_to_accumulate_copy.run();
+ }
+
+ CLScheduler::get().enqueue(_accumulate_projection);
+
+ if(_projection_tensor_copy_required)
+ {
+ _projection_accumulate_to_output_copy.run();
+ }
+
+ if(_has_projection_clipping)
+ {
+ _projection_clip.run();
+ }
+ }
+ else
+ {
+ if(_projection_tensor_copy_required)
+ {
+ _hidden_to_output_copy.run();
+ }
+ }
+
+ // Copy output_state_out to output
+ CLScheduler::get().enqueue(_copy_output);
+}
+
+void CLQLSTMLayer::prepare()
+{
+ if(!_is_prepared)
+ {
+ // Pre-transpose weights to be used in GEMM.
+ _input_to_forget_weights_transposed.allocator()->allocate();
+ _input_to_cell_weights_transposed.allocator()->allocate();
+ _input_to_output_weights_transposed.allocator()->allocate();
+ _recurrent_to_forget_weights_transposed.allocator()->allocate();
+ _recurrent_to_cell_weights_transposed.allocator()->allocate();
+ _recurrent_to_output_weights_transposed.allocator()->allocate();
+ _transpose_input_to_forget_weights.run();
+ _transpose_input_to_cell_weights.run();
+ _transpose_input_to_output_weights.run();
+ _transpose_recurrent_to_forget_weights.run();
+ _transpose_recurrent_to_cell_weights.run();
+ _transpose_recurrent_to_output_weights.run();
+
+ // Precompute effective biases
+ if(_has_cifg)
+ {
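+ // With CIFG the input gate is later derived as (1 - forget gate); fill the constant tensor with 32767, roughly 1.0 in the Q0.15 gate encoding.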
+ _ones.map(true);
+ std::fill_n(reinterpret_cast<int16_t *>(_ones.buffer()), _ones.info()->total_size() / _ones.info()->element_size(), 32767);
+ _ones.unmap();
+ }
+ else
+ {
+ _input_to_input_eff_bias.allocator()->allocate();
+ _recurrent_to_input_eff_bias.allocator()->allocate();
+ CLScheduler::get().enqueue(_input_to_input_reduction);
+ CLScheduler::get().enqueue(_recurrent_to_input_reduction);
+
+ _input_to_input_weights_transposed.allocator()->allocate();
+ _recurrent_to_input_weights_transposed.allocator()->allocate();
+ _transpose_input_to_input_weights.run();
+ _transpose_recurrent_to_input_weights.run();
+ _input_to_input_weights->mark_as_unused();
+ _recurrent_to_input_weights->mark_as_unused();
+ }
+ _input_to_forget_eff_bias.allocator()->allocate();
+ _recurrent_to_forget_eff_bias.allocator()->allocate();
+ _input_to_cell_eff_bias.allocator()->allocate();
+ _recurrent_to_cell_eff_bias.allocator()->allocate();
+ _input_to_output_eff_bias.allocator()->allocate();
+ _recurrent_to_output_eff_bias.allocator()->allocate();
+ CLScheduler::get().enqueue(_input_to_forget_reduction);
+ CLScheduler::get().enqueue(_recurrent_to_forget_reduction);
+ CLScheduler::get().enqueue(_input_to_cell_reduction);
+ CLScheduler::get().enqueue(_recurrent_to_cell_reduction);
+ CLScheduler::get().enqueue(_input_to_output_reduction);
+ CLScheduler::get().enqueue(_recurrent_to_output_reduction);
+
+ if(_has_projection)
+ {
+ if(_projection_bias != nullptr)
+ {
+ _projection_eff_bias.allocator()->allocate();
+ CLScheduler::get().enqueue(_projection_reduction);
+ _projection_bias->mark_as_unused();
+ }
+
+ _projection_weights_transposed.allocator()->allocate();
+ _transpose_projection_weights.run();
+ _projection_weights->mark_as_unused();
+
+ if(!_projection_tensor_copy_required)
+ {
+ _hidden_gate.mark_as_unused();
+ _projection_accumulate_res.mark_as_unused();
+ }
+ }
+
+ // Mark weights as unused
+ _input_to_forget_weights->mark_as_unused();
+ _input_to_cell_weights->mark_as_unused();
+ _input_to_output_weights->mark_as_unused();
+ _recurrent_to_forget_weights->mark_as_unused();
+ _recurrent_to_cell_weights->mark_as_unused();
+ _recurrent_to_output_weights->mark_as_unused();
+
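+ // Wait for the one-off transpose and reduction jobs to complete so the original weight buffers can safely be reused.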
+ CLScheduler::get().queue().finish();
+ _is_prepared = true;
+ }
+}
+
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLQuantizationLayer.cpp b/src/runtime/CL/functions/CLQuantizationLayer.cpp
index df10e1e..6239f27 100644
--- a/src/runtime/CL/functions/CLQuantizationLayer.cpp
+++ b/src/runtime/CL/functions/CLQuantizationLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,14 +24,19 @@
#include "arm_compute/runtime/CL/functions/CLQuantizationLayer.h"
#include "arm_compute/core/CL/kernels/CLQuantizationLayerKernel.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
namespace arm_compute
{
void CLQuantizationLayer::configure(const ICLTensor *input, ICLTensor *output)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, output);
+}
+
+void CLQuantizationLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
+{
auto k = arm_compute::support::cpp14::make_unique<CLQuantizationLayerKernel>();
- k->configure(input, output);
+ k->configure(compile_context, input, output);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLRNNLayer.cpp b/src/runtime/CL/functions/CLRNNLayer.cpp
index 19eb69f..57b8d70 100644
--- a/src/runtime/CL/functions/CLRNNLayer.cpp
+++ b/src/runtime/CL/functions/CLRNNLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,7 +28,6 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "support/ToolchainSupport.h"
#include <utility>
@@ -69,6 +68,13 @@
void CLRNNLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *recurrent_weights, const ICLTensor *bias, ICLTensor *hidden_state, ICLTensor *output,
ActivationLayerInfo &info)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, weights, recurrent_weights, bias, hidden_state, output, info);
+}
+
+void CLRNNLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *recurrent_weights, const ICLTensor *bias,
+ ICLTensor *hidden_state,
+ ICLTensor *output, ActivationLayerInfo &info)
+{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output);
ARM_COMPUTE_ERROR_THROW_ON(CLRNNLayer::validate(input->info(), weights->info(), recurrent_weights->info(), bias->info(), hidden_state->info(), output->info(), info));
@@ -82,23 +88,23 @@
// Manage intermediate buffers and configure
_memory_group.manage(&_fully_connected_out);
- _fully_connected_kernel.configure(input, weights, bias, &_fully_connected_out);
+ _fully_connected_kernel.configure(compile_context, input, weights, bias, &_fully_connected_out);
_memory_group.manage(&_gemm_output);
- _gemm_state_f.configure(hidden_state, recurrent_weights, nullptr, &_gemm_output, 1.f, 0.f);
+ _gemm_state_f.configure(compile_context, hidden_state, recurrent_weights, nullptr, &_gemm_output, 1.f, 0.f);
_add_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type()));
_memory_group.manage(&_add_output);
- _add_kernel.configure(ArithmeticOperation::ADD, &_fully_connected_out, &_gemm_output, &_add_output, ConvertPolicy::SATURATE);
+ _add_kernel.configure(compile_context, ArithmeticOperation::ADD, &_fully_connected_out, &_gemm_output, &_add_output, ConvertPolicy::SATURATE);
_fully_connected_out.allocator()->allocate();
_gemm_output.allocator()->allocate();
- _activation_kernel.configure(&_add_output, hidden_state, info);
+ _activation_kernel.configure(compile_context, &_add_output, hidden_state, info);
_add_output.allocator()->allocate();
- _copy_kernel.configure(hidden_state, output);
+ _copy_kernel.configure(compile_context, hidden_state, output);
}
void CLRNNLayer::run()
diff --git a/src/runtime/CL/functions/CLROIAlignLayer.cpp b/src/runtime/CL/functions/CLROIAlignLayer.cpp
index 5bfd594..43b58dd 100644
--- a/src/runtime/CL/functions/CLROIAlignLayer.cpp
+++ b/src/runtime/CL/functions/CLROIAlignLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,7 +25,7 @@
#include "arm_compute/core/CL/ICLArray.h"
#include "arm_compute/core/CL/kernels/CLROIAlignLayerKernel.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
namespace arm_compute
{
@@ -38,9 +38,14 @@
void CLROIAlignLayer::configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, rois, output, pool_info);
+}
+
+void CLROIAlignLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
+{
// Configure ROI pooling kernel
auto k = arm_compute::support::cpp14::make_unique<CLROIAlignLayerKernel>();
- k->configure(input, rois, output, pool_info);
+ k->configure(compile_context, input, rois, output, pool_info);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLROIPoolingLayer.cpp b/src/runtime/CL/functions/CLROIPoolingLayer.cpp
index 7bb4178..bb54cfa 100644
--- a/src/runtime/CL/functions/CLROIPoolingLayer.cpp
+++ b/src/runtime/CL/functions/CLROIPoolingLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,14 +26,19 @@
#include "arm_compute/core/CL/ICLArray.h"
#include "arm_compute/core/CL/kernels/CLROIPoolingLayerKernel.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
using namespace arm_compute;
void CLROIPoolingLayer::configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, rois, output, pool_info);
+}
+
+void CLROIPoolingLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
+{
// Configure ROI pooling kernel
auto k = arm_compute::support::cpp14::make_unique<CLROIPoolingLayerKernel>();
- k->configure(input, rois, output, pool_info);
+ k->configure(compile_context, input, rois, output, pool_info);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLRange.cpp b/src/runtime/CL/functions/CLRange.cpp
index b2cd472..b29b03d 100644
--- a/src/runtime/CL/functions/CLRange.cpp
+++ b/src/runtime/CL/functions/CLRange.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,15 +28,20 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
using namespace arm_compute;
void CLRange::configure(ICLTensor *output, const float start, const float end, const float step)
{
+ configure(CLKernelLibrary::get().get_compile_context(), output, start, end, step);
+}
+
+void CLRange::configure(const CLCompileContext &compile_context, ICLTensor *output, const float start, const float end, const float step)
+{
auto k = arm_compute::support::cpp14::make_unique<CLRangeKernel>();
k->set_target(CLScheduler::get().target());
- k->configure(output, start, end, step);
+ k->configure(compile_context, output, start, end, step);
_kernel = std::move(k);
// Tune kernels
diff --git a/src/runtime/CL/functions/CLReduceMean.cpp b/src/runtime/CL/functions/CLReduceMean.cpp
index 9920617..ce44763 100644
--- a/src/runtime/CL/functions/CLReduceMean.cpp
+++ b/src/runtime/CL/functions/CLReduceMean.cpp
@@ -29,7 +29,6 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "support/ToolchainSupport.h"
namespace arm_compute
{
@@ -84,6 +83,7 @@
}
const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
}
return Status{};
}
@@ -94,6 +94,11 @@
}
void CLReduceMean::configure(ICLTensor *input, const Coordinates &reduction_axis, bool keep_dims, ICLTensor *output)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, reduction_axis, keep_dims, output);
+}
+
+void CLReduceMean::configure(const CLCompileContext &compile_context, ICLTensor *input, const Coordinates &reduction_axis, bool keep_dims, ICLTensor *output)
+{
// Perform validate step
ARM_COMPUTE_ERROR_THROW_ON(CLReduceMean::validate(input->info(), reduction_axis, keep_dims, output->info()));
// Output auto inizialitation if not yet initialized
@@ -119,13 +124,13 @@
if(i == _reduction_ops - 1 && keep_dims)
{
- _reduction_kernels[i].configure(in, output, axis_local[i], ReductionOperation::MEAN_SUM);
+ _reduction_kernels[i].configure(compile_context, in, output, axis_local[i], ReductionOperation::MEAN_SUM);
}
else
{
_reduced_outs[i].allocator()->init(TensorInfo(out_shape, input->info()->num_channels(), input->info()->data_type(), input->info()->quantization_info()));
_memory_group.manage(&_reduced_outs[i]);
- _reduction_kernels[i].configure(in, &_reduced_outs[i], axis_local[i], ReductionOperation::MEAN_SUM);
+ _reduction_kernels[i].configure(compile_context, in, &_reduced_outs[i], axis_local[i], ReductionOperation::MEAN_SUM);
}
}
@@ -148,7 +153,7 @@
out_shape.remove_dimension(axis_local[i] - i);
}
auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(out_shape));
- _reshape.configure(&_reduced_outs[_reduction_ops - 1], output);
+ _reshape.configure(compile_context, &_reduced_outs[_reduction_ops - 1], output);
}
}
diff --git a/src/runtime/CL/functions/CLReductionOperation.cpp b/src/runtime/CL/functions/CLReductionOperation.cpp
index fc902c4..b659ecf 100644
--- a/src/runtime/CL/functions/CLReductionOperation.cpp
+++ b/src/runtime/CL/functions/CLReductionOperation.cpp
@@ -34,7 +34,7 @@
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/Tensor.h"
#include "arm_compute/runtime/Utils.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
namespace arm_compute
{
@@ -191,6 +191,11 @@
void CLReductionOperation::configure(ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, bool keep_dims)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, output, axis, op, keep_dims);
+}
+
+void CLReductionOperation::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, bool keep_dims)
+{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
_op = op;
_num_of_stages = calculate_number_of_stages_only_x_axis(input->info()->dimension(0), axis);
@@ -218,7 +223,7 @@
_memory_group.manage(&_results_vector.back());
}
- _reduction_kernels_vector[0].configure(input, output_internal, axis, op, 0);
+ _reduction_kernels_vector[0].configure(compile_context, input, output_internal, axis, op, 0);
}
else
{
@@ -318,15 +323,15 @@
ARM_COMPUTE_ERROR("Not supported");
}
- _reduction_kernels_vector[0].configure(input, &_results_vector[0], axis, first_kernel_op);
- _border_handlers_vector[0].configure(input, _reduction_kernels_vector[0].border_size(), BorderMode::CONSTANT, pixelValue);
+ _reduction_kernels_vector[0].configure(compile_context, input, &_results_vector[0], axis, first_kernel_op);
+ _border_handlers_vector[0].configure(compile_context, input, _reduction_kernels_vector[0].border_size(), BorderMode::CONSTANT, pixelValue);
// Apply ReductionOperation on intermediate stages
for(unsigned int i = 1; i < _num_of_stages - 1; ++i)
{
_memory_group.manage(&_results_vector[i]);
- _reduction_kernels_vector[i].configure(&_results_vector[i - 1], &_results_vector[i], axis, intermediate_kernel_op);
- _border_handlers_vector[i].configure(&_results_vector[i - 1], _reduction_kernels_vector[i].border_size(), BorderMode::CONSTANT, pixelValue);
+ _reduction_kernels_vector[i].configure(compile_context, &_results_vector[i - 1], &_results_vector[i], axis, intermediate_kernel_op);
+ _border_handlers_vector[i].configure(compile_context, &_results_vector[i - 1], _reduction_kernels_vector[i].border_size(), BorderMode::CONSTANT, pixelValue);
_results_vector[i - 1].allocator()->allocate();
}
@@ -339,14 +344,14 @@
_memory_group.manage(&_results_vector.back());
}
- _reduction_kernels_vector[last_stage].configure(&_results_vector[last_stage - 1], output_internal, axis, last_kernel_op, input_width);
- _border_handlers_vector[last_stage].configure(&_results_vector[last_stage - 1], _reduction_kernels_vector[last_stage].border_size(), BorderMode::CONSTANT, pixelValue);
+ _reduction_kernels_vector[last_stage].configure(compile_context, &_results_vector[last_stage - 1], output_internal, axis, last_kernel_op, input_width);
+ _border_handlers_vector[last_stage].configure(compile_context, &_results_vector[last_stage - 1], _reduction_kernels_vector[last_stage].border_size(), BorderMode::CONSTANT, pixelValue);
_results_vector[last_stage - 1].allocator()->allocate();
}
if(_is_reshape_required)
{
- _reshape_kernel.configure(&_results_vector.back(), output);
+ _reshape_kernel.configure(compile_context, &_results_vector.back(), output);
_results_vector.back().allocator()->allocate();
}
}
diff --git a/src/runtime/CL/functions/CLRemap.cpp b/src/runtime/CL/functions/CLRemap.cpp
index bc3fd4e..af241ec 100644
--- a/src/runtime/CL/functions/CLRemap.cpp
+++ b/src/runtime/CL/functions/CLRemap.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,7 +29,7 @@
#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
#include <utility>
@@ -37,6 +37,13 @@
void CLRemap::configure(ICLTensor *input, const ICLTensor *map_x, const ICLTensor *map_y, ICLTensor *output, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, map_x, map_y, output, policy, border_mode, constant_border_value);
+}
+
+void CLRemap::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *map_x, const ICLTensor *map_y, ICLTensor *output, InterpolationPolicy policy,
+ BorderMode border_mode,
+ uint8_t constant_border_value)
+{
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_x, 1, DataType::F32);
@@ -44,7 +51,7 @@
ARM_COMPUTE_ERROR_ON_MSG(policy == InterpolationPolicy::AREA, "Area interpolation is not supported");
auto k = arm_compute::support::cpp14::make_unique<CLRemapKernel>();
- k->configure(input, map_x, map_y, output, policy, border_mode == BorderMode::UNDEFINED);
+ k->configure(compile_context, input, map_x, map_y, output, policy, border_mode == BorderMode::UNDEFINED);
_kernel = std::move(k);
- _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+ _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
}
diff --git a/src/runtime/CL/functions/CLReorgLayer.cpp b/src/runtime/CL/functions/CLReorgLayer.cpp
index 8e04d16..ea93314 100644
--- a/src/runtime/CL/functions/CLReorgLayer.cpp
+++ b/src/runtime/CL/functions/CLReorgLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,7 +28,7 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
#include <utility>
@@ -36,8 +36,13 @@
void CLReorgLayer::configure(ICLTensor *input, ICLTensor *output, int32_t stride)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, output, stride);
+}
+
+void CLReorgLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, int32_t stride)
+{
auto k = arm_compute::support::cpp14::make_unique<CLReorgLayerKernel>();
- k->configure(input, output, stride);
+ k->configure(compile_context, input, output, stride);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLReshapeLayer.cpp b/src/runtime/CL/functions/CLReshapeLayer.cpp
index b98a99d..13baedb 100644
--- a/src/runtime/CL/functions/CLReshapeLayer.cpp
+++ b/src/runtime/CL/functions/CLReshapeLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,15 +25,20 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/CL/kernels/CLReshapeLayerKernel.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
/** [CLReshapeLayer snippet] **/
using namespace arm_compute;
void CLReshapeLayer::configure(const ICLTensor *input, ICLTensor *output)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, output);
+}
+
+void CLReshapeLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
+{
auto k = arm_compute::support::cpp14::make_unique<CLReshapeLayerKernel>();
- k->configure(input, output);
+ k->configure(compile_context, input, output);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLReverse.cpp b/src/runtime/CL/functions/CLReverse.cpp
index 0f86b9f..3c8bc15 100644
--- a/src/runtime/CL/functions/CLReverse.cpp
+++ b/src/runtime/CL/functions/CLReverse.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,14 +25,19 @@
#include "arm_compute/core/CL/kernels/CLReverseKernel.h"
#include "arm_compute/core/Types.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
namespace arm_compute
{
void CLReverse::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *axis)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, output, axis);
+}
+
+void CLReverse::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *axis)
+{
auto k = arm_compute::support::cpp14::make_unique<CLReverseKernel>();
- k->configure(input, output, axis);
+ k->configure(compile_context, input, output, axis);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLScale.cpp b/src/runtime/CL/functions/CLScale.cpp
index a355915..a9395bd 100644
--- a/src/runtime/CL/functions/CLScale.cpp
+++ b/src/runtime/CL/functions/CLScale.cpp
@@ -28,17 +28,23 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
using namespace arm_compute;
void CLScale::configure(ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, SamplingPolicy sampling_policy, bool use_padding,
bool align_corners)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, output, policy, border_mode, constant_border_value, sampling_policy, use_padding, align_corners);
+}
+
+void CLScale::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value,
+ SamplingPolicy sampling_policy, bool use_padding, bool align_corners)
+{
ARM_COMPUTE_UNUSED(use_padding);
auto k = arm_compute::support::cpp14::make_unique<CLScaleKernel>();
k->set_target(CLScheduler::get().target());
- k->configure(input, output, policy, border_mode, sampling_policy, align_corners);
+ k->configure(compile_context, input, output, policy, border_mode, sampling_policy, align_corners);
_kernel = std::move(k);
// Tune kernels
@@ -50,7 +56,7 @@
{
border_mode = BorderMode::CONSTANT;
}
- _border_handler.configure(input, _kernel->border_size(), border_mode, constant_border_value);
+ _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, constant_border_value);
}
Status CLScale::validate(const ITensorInfo *input, const ITensorInfo *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, SamplingPolicy sampling_policy,
diff --git a/src/runtime/CL/functions/CLScharr3x3.cpp b/src/runtime/CL/functions/CLScharr3x3.cpp
index 73f8673..faad542 100644
--- a/src/runtime/CL/functions/CLScharr3x3.cpp
+++ b/src/runtime/CL/functions/CLScharr3x3.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,7 +25,7 @@
#include "arm_compute/core/CL/kernels/CLScharr3x3Kernel.h"
#include "arm_compute/core/PixelValue.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
#include <utility>
@@ -33,8 +33,13 @@
void CLScharr3x3::configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, output_x, output_y, border_mode, constant_border_value);
+}
+
+void CLScharr3x3::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value)
+{
auto k = arm_compute::support::cpp14::make_unique<CLScharr3x3Kernel>();
- k->configure(input, output_x, output_y, border_mode == BorderMode::UNDEFINED);
+ k->configure(compile_context, input, output_x, output_y, border_mode == BorderMode::UNDEFINED);
_kernel = std::move(k);
- _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+ _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
}
diff --git a/src/runtime/CL/functions/CLSelect.cpp b/src/runtime/CL/functions/CLSelect.cpp
index 90c368e..7187010 100644
--- a/src/runtime/CL/functions/CLSelect.cpp
+++ b/src/runtime/CL/functions/CLSelect.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,8 +33,13 @@
{
void CLSelect::configure(const ICLTensor *c, const ICLTensor *x, const ICLTensor *y, ICLTensor *output)
{
+ configure(CLKernelLibrary::get().get_compile_context(), c, x, y, output);
+}
+
+void CLSelect::configure(const CLCompileContext &compile_context, const ICLTensor *c, const ICLTensor *x, const ICLTensor *y, ICLTensor *output)
+{
auto k = arm_compute::support::cpp14::make_unique<CLSelectKernel>();
- k->configure(c, x, y, output);
+ k->configure(compile_context, c, x, y, output);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLSlice.cpp b/src/runtime/CL/functions/CLSlice.cpp
index f630853..e8cc0f5 100644
--- a/src/runtime/CL/functions/CLSlice.cpp
+++ b/src/runtime/CL/functions/CLSlice.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,19 +27,24 @@
#include "arm_compute/core/CL/kernels/CLStridedSliceKernel.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/helpers/tensor_transform.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
namespace arm_compute
{
void CLSlice::configure(const ICLTensor *input, ICLTensor *output, const Coordinates &starts, const Coordinates &ends)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, output, starts, ends);
+}
+
+void CLSlice::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Coordinates &starts, const Coordinates &ends)
+{
ARM_COMPUTE_ERROR_ON_NULLPTR(input);
// Get absolute end coordinates
const int32_t slice_end_mask = arm_compute::helpers::tensor_transform::construct_slice_end_mask(ends);
auto k = arm_compute::support::cpp14::make_unique<CLStridedSliceKernel>();
- k->configure(input, output, starts, ends, BiStrides(), 0, slice_end_mask, 0);
+ k->configure(compile_context, input, output, starts, ends, BiStrides(), 0, slice_end_mask, 0);
_kernel = std::move(k);
}
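
`CLSlice` computes the end mask internally from the `ends` coordinates before delegating to `CLStridedSliceKernel`. A sketch of the caller side, with illustrative shapes:

```cpp
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLSlice.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    CLTensor src{}, dst{};
    src.allocator()->init(TensorInfo(TensorShape(4U, 5U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(4U, 2U), 1, DataType::F32)); // rows 1..3 of 5

    CLSlice slice{};
    // starts are inclusive, ends exclusive: keep all of dimension 0, rows [1, 3) of dimension 1.
    slice.configure(CLKernelLibrary::get().get_compile_context(), &src, &dst,
                    Coordinates(0, 1), Coordinates(4, 3));

    src.allocator()->allocate();
    dst.allocator()->allocate();
    slice.run();
    CLScheduler::get().sync();
    return 0;
}
```
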
diff --git a/src/runtime/CL/functions/CLSobel3x3.cpp b/src/runtime/CL/functions/CLSobel3x3.cpp
index e227e58..c3604f9 100644
--- a/src/runtime/CL/functions/CLSobel3x3.cpp
+++ b/src/runtime/CL/functions/CLSobel3x3.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,7 +25,7 @@
#include "arm_compute/core/CL/kernels/CLSobel3x3Kernel.h"
#include "arm_compute/core/PixelValue.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
#include <utility>
@@ -33,8 +33,13 @@
void CLSobel3x3::configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, output_x, output_y, border_mode, constant_border_value);
+}
+
+void CLSobel3x3::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value)
+{
auto k = arm_compute::support::cpp14::make_unique<CLSobel3x3Kernel>();
- k->configure(input, output_x, output_y, border_mode == BorderMode::UNDEFINED);
+ k->configure(compile_context, input, output_x, output_y, border_mode == BorderMode::UNDEFINED);
_kernel = std::move(k);
- _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+ _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
}
diff --git a/src/runtime/CL/functions/CLSobel5x5.cpp b/src/runtime/CL/functions/CLSobel5x5.cpp
index 22fbef1..f8a33f3 100644
--- a/src/runtime/CL/functions/CLSobel5x5.cpp
+++ b/src/runtime/CL/functions/CLSobel5x5.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -40,6 +40,11 @@
void CLSobel5x5::configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, output_x, output_y, border_mode, constant_border_value);
+}
+
+void CLSobel5x5::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value)
+{
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
const bool run_sobel_x = output_x != nullptr;
@@ -53,8 +58,8 @@
_tmp_y.allocator()->init(tensor_info);
_memory_group.manage(&_tmp_x);
_memory_group.manage(&_tmp_y);
- _sobel_hor.configure(input, &_tmp_x, &_tmp_y, border_mode == BorderMode::UNDEFINED);
- _sobel_vert.configure(&_tmp_x, &_tmp_y, output_x, output_y, border_mode == BorderMode::UNDEFINED);
+ _sobel_hor.configure(compile_context, input, &_tmp_x, &_tmp_y, border_mode == BorderMode::UNDEFINED);
+ _sobel_vert.configure(compile_context, &_tmp_x, &_tmp_y, output_x, output_y, border_mode == BorderMode::UNDEFINED);
_tmp_x.allocator()->allocate();
_tmp_y.allocator()->allocate();
}
@@ -62,19 +67,19 @@
{
_tmp_x.allocator()->init(tensor_info);
_memory_group.manage(&_tmp_x);
- _sobel_hor.configure(input, &_tmp_x, nullptr, border_mode == BorderMode::UNDEFINED);
- _sobel_vert.configure(&_tmp_x, nullptr, output_x, nullptr, border_mode == BorderMode::UNDEFINED);
+ _sobel_hor.configure(compile_context, input, &_tmp_x, nullptr, border_mode == BorderMode::UNDEFINED);
+ _sobel_vert.configure(compile_context, &_tmp_x, nullptr, output_x, nullptr, border_mode == BorderMode::UNDEFINED);
_tmp_x.allocator()->allocate();
}
else if(run_sobel_y)
{
_tmp_y.allocator()->init(tensor_info);
_memory_group.manage(&_tmp_y);
- _sobel_hor.configure(input, nullptr, &_tmp_y, border_mode == BorderMode::UNDEFINED);
- _sobel_vert.configure(nullptr, &_tmp_y, nullptr, output_y, border_mode == BorderMode::UNDEFINED);
+ _sobel_hor.configure(compile_context, input, nullptr, &_tmp_y, border_mode == BorderMode::UNDEFINED);
+ _sobel_vert.configure(compile_context, nullptr, &_tmp_y, nullptr, output_y, border_mode == BorderMode::UNDEFINED);
_tmp_y.allocator()->allocate();
}
- _border_handler.configure(input, _sobel_hor.border_size(), border_mode, PixelValue(constant_border_value));
+ _border_handler.configure(compile_context, input, _sobel_hor.border_size(), border_mode, PixelValue(constant_border_value));
}
void CLSobel5x5::run()
diff --git a/src/runtime/CL/functions/CLSobel7x7.cpp b/src/runtime/CL/functions/CLSobel7x7.cpp
index 9b38f69..6d3c7f0 100644
--- a/src/runtime/CL/functions/CLSobel7x7.cpp
+++ b/src/runtime/CL/functions/CLSobel7x7.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -40,6 +40,11 @@
void CLSobel7x7::configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, output_x, output_y, border_mode, constant_border_value);
+}
+
+void CLSobel7x7::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value)
+{
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
const bool run_sobel_x = output_x != nullptr;
@@ -53,8 +58,8 @@
_tmp_y.allocator()->init(tensor_info);
_memory_group.manage(&_tmp_x);
_memory_group.manage(&_tmp_y);
- _sobel_hor.configure(input, &_tmp_x, &_tmp_y, border_mode == BorderMode::UNDEFINED);
- _sobel_vert.configure(&_tmp_x, &_tmp_y, output_x, output_y, border_mode == BorderMode::UNDEFINED);
+ _sobel_hor.configure(compile_context, input, &_tmp_x, &_tmp_y, border_mode == BorderMode::UNDEFINED);
+ _sobel_vert.configure(compile_context, &_tmp_x, &_tmp_y, output_x, output_y, border_mode == BorderMode::UNDEFINED);
_tmp_x.allocator()->allocate();
_tmp_y.allocator()->allocate();
}
@@ -62,19 +67,19 @@
{
_tmp_x.allocator()->init(tensor_info);
_memory_group.manage(&_tmp_x);
- _sobel_hor.configure(input, &_tmp_x, nullptr, border_mode == BorderMode::UNDEFINED);
- _sobel_vert.configure(&_tmp_x, nullptr, output_x, nullptr, border_mode == BorderMode::UNDEFINED);
+ _sobel_hor.configure(compile_context, input, &_tmp_x, nullptr, border_mode == BorderMode::UNDEFINED);
+ _sobel_vert.configure(compile_context, &_tmp_x, nullptr, output_x, nullptr, border_mode == BorderMode::UNDEFINED);
_tmp_x.allocator()->allocate();
}
else if(run_sobel_y)
{
_tmp_y.allocator()->init(tensor_info);
_memory_group.manage(&_tmp_y);
- _sobel_hor.configure(input, nullptr, &_tmp_y, border_mode == BorderMode::UNDEFINED);
- _sobel_vert.configure(nullptr, &_tmp_y, nullptr, output_y, border_mode == BorderMode::UNDEFINED);
+ _sobel_hor.configure(compile_context, input, nullptr, &_tmp_y, border_mode == BorderMode::UNDEFINED);
+ _sobel_vert.configure(compile_context, nullptr, &_tmp_y, nullptr, output_y, border_mode == BorderMode::UNDEFINED);
_tmp_y.allocator()->allocate();
}
- _border_handler.configure(input, _sobel_hor.border_size(), border_mode, PixelValue(constant_border_value));
+ _border_handler.configure(compile_context, input, _sobel_hor.border_size(), border_mode, PixelValue(constant_border_value));
}
void CLSobel7x7::run()
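
The separable Sobel functions (`CLSobel5x5`, `CLSobel7x7`) configure a horizontal pass, a vertical pass and a border handler, all against the same compile context. A sketch assuming the documented U8 input / S32 output types for the 7x7 variant:

```cpp
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLSobel7x7.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    CLTensor src{}, grad_x{}, grad_y{};
    src.allocator()->init(TensorInfo(TensorShape(128U, 128U), 1, DataType::U8));
    grad_x.allocator()->init(TensorInfo(TensorShape(128U, 128U), 1, DataType::S32));
    grad_y.allocator()->init(TensorInfo(TensorShape(128U, 128U), 1, DataType::S32));

    CLSobel7x7 sobel{};
    // Passing both outputs takes the run_sobel_x && run_sobel_y branch above, so both
    // intermediate tensors are managed by the function's memory group.
    sobel.configure(CLKernelLibrary::get().get_compile_context(), &src, &grad_x, &grad_y,
                    BorderMode::UNDEFINED, 0 /* constant_border_value, unused here */);

    src.allocator()->allocate();
    grad_x.allocator()->allocate();
    grad_y.allocator()->allocate();
    sobel.run();
    CLScheduler::get().sync();
    return 0;
}
```
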
diff --git a/src/runtime/CL/functions/CLSoftmaxLayer.cpp b/src/runtime/CL/functions/CLSoftmaxLayer.cpp
index e01d2c7..b0b2117 100644
--- a/src/runtime/CL/functions/CLSoftmaxLayer.cpp
+++ b/src/runtime/CL/functions/CLSoftmaxLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -44,6 +44,12 @@
template <bool IS_LOG>
void CLSoftmaxLayerGeneric<IS_LOG>::configure_reshape_input_kernel(const ICLTensor *input, const ICLTensor *output, size_t axis)
{
+ configure_reshape_input_kernel(CLKernelLibrary::get().get_compile_context(), input, output, axis);
+}
+
+template <bool IS_LOG>
+void CLSoftmaxLayerGeneric<IS_LOG>::configure_reshape_input_kernel(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *output, size_t axis)
+{
// Flatten the input
const TensorShape shape_flatten = misc::shape_calculator::compute_softmax_shape(input->info(), axis);
@@ -56,13 +62,13 @@
if(axis != 3)
{
auto reshape_kernel_ptr = support::cpp14::make_unique<CLReshapeLayerKernel>();
- reshape_kernel_ptr->configure(input, &_input_flattened);
+ reshape_kernel_ptr->configure(compile_context, input, &_input_flattened);
_flatten_kernel_ptr = std::move(reshape_kernel_ptr);
}
else
{
auto flatten_kernel_ptr = support::cpp14::make_unique<CLFlattenLayerKernel>();
- flatten_kernel_ptr->configure(input, &_input_flattened);
+ flatten_kernel_ptr->configure(compile_context, input, &_input_flattened);
_flatten_kernel_ptr = std::move(flatten_kernel_ptr);
}
@@ -74,6 +80,12 @@
template <bool IS_LOG>
void CLSoftmaxLayerGeneric<IS_LOG>::configure(const ICLTensor *input, ICLTensor *output, float beta, size_t axis)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, output, beta, axis);
+}
+
+template <bool IS_LOG>
+void CLSoftmaxLayerGeneric<IS_LOG>::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, float beta, size_t axis)
+{
// Perform validation step
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_ERROR_THROW_ON(CLSoftmaxLayerGeneric<IS_LOG>::validate(input->info(), output->info(), beta, axis));
@@ -123,7 +135,7 @@
softmax_info.input_data_type = input_2D->info()->data_type();
// Configure kernels
- _max_shift_exp_sum_kernel.configure(input_2D, &_max, &_tmp, &_sum, softmax_info);
+ _max_shift_exp_sum_kernel.configure(compile_context, input_2D, &_max, &_tmp, &_sum, softmax_info);
if(_needs_flattening)
{
@@ -131,10 +143,10 @@
_memory_group.manage(&_output_flattened);
// The normalization kernel stores the result in a flat output tensor
- _norm_kernel.configure(&_tmp, &_sum, &_output_flattened, softmax_info);
+ _norm_kernel.configure(compile_context, &_tmp, &_sum, &_output_flattened, softmax_info);
// Reshape the flat output into the requested (4D) output
- _reshape_kernel.configure(&_output_flattened, output);
+ _reshape_kernel.configure(compile_context, &_output_flattened, output);
// Allocate the intermediate flat tensors
_input_flattened.allocator()->allocate();
@@ -143,7 +155,7 @@
else
{
// Softmax 2D case
- _norm_kernel.configure(&_tmp, &_sum, output, softmax_info);
+ _norm_kernel.configure(compile_context, &_tmp, &_sum, output, softmax_info);
}
// Allocate intermediate buffers
@@ -203,7 +215,7 @@
}
template <bool IS_LOG>
-void           CLSoftmaxLayerGeneric<IS_LOG>::run()
+void CLSoftmaxLayerGeneric<IS_LOG>::run()
{
MemoryGroupResourceScope scope_mg(_memory_group);
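
`CLSoftmaxLayerGeneric` is templated on `IS_LOG`, so both the softmax and log-softmax instantiations gain the same compile-context overloads. A sketch using the non-log instantiation, with illustrative shapes and the default `beta`/`axis` values passed explicitly:

```cpp
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLSoftmaxLayer.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    CLTensor logits{}, probs{};
    logits.allocator()->init(TensorInfo(TensorShape(10U, 32U), 1, DataType::F32)); // 10 classes, batch 32
    probs.allocator()->init(TensorInfo(TensorShape(10U, 32U), 1, DataType::F32));

    CLSoftmaxLayer softmax{}; // the IS_LOG = false instantiation
    softmax.configure(CLKernelLibrary::get().get_compile_context(), &logits, &probs,
                      1.0f /* beta */, 1 /* axis (default) */);

    logits.allocator()->allocate();
    probs.allocator()->allocate();
    softmax.run();
    CLScheduler::get().sync();
    return 0;
}
```
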
diff --git a/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp b/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp
index fa6e82e..021d316 100644
--- a/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp
+++ b/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -39,26 +39,37 @@
void CLSpaceToBatchLayer::configure(const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, block_shape, paddings, output);
+}
+
+void CLSpaceToBatchLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output)
+{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, block_shape, paddings, output);
if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size())
{
_has_padding = true;
- _memset_kernel.configure(output, PixelValue(0, input->info()->data_type(), input->info()->quantization_info()));
+ _memset_kernel.configure(compile_context, output, PixelValue(0, input->info()->data_type(), input->info()->quantization_info()));
}
- _space_to_batch_kernel.configure(input, block_shape, paddings, output);
+ _space_to_batch_kernel.configure(compile_context, input, block_shape, paddings, output);
}
void CLSpaceToBatchLayer::configure(const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, ICLTensor *output)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, block_shape_x, block_shape_y, padding_left, padding_right, output);
+}
+
+void CLSpaceToBatchLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left,
+ const Size2D &padding_right, ICLTensor *output)
+{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size())
{
_has_padding = true;
- _memset_kernel.configure(output, PixelValue(0, input->info()->data_type(), input->info()->quantization_info()));
+ _memset_kernel.configure(compile_context, output, PixelValue(0, input->info()->data_type(), input->info()->quantization_info()));
}
- _space_to_batch_kernel.configure(input, block_shape_x, block_shape_y, padding_left, padding_right, output);
+ _space_to_batch_kernel.configure(compile_context, input, block_shape_x, block_shape_y, padding_left, padding_right, output);
}
Status CLSpaceToBatchLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output)
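
When the padded output is larger than the input, `CLSpaceToBatchLayer` first memsets the output to the zero pixel value; both that kernel and the main kernel now receive the compile context. A sketch of the fixed-block-shape overload; the output shape follows from a 2x2 block with no padding, as I understand the layout:

```cpp
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/Size2D.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLSpaceToBatchLayer.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    // NCHW-style shape (W, H, C, N): a 4x4 single-channel image, batch 1.
    CLTensor src{}, dst{};
    src.allocator()->init(TensorInfo(TensorShape(4U, 4U, 1U, 1U), 1, DataType::F32));
    // Block 2x2 with no padding: spatial dims halve, batch grows by 4.
    dst.allocator()->init(TensorInfo(TensorShape(2U, 2U, 1U, 4U), 1, DataType::F32));

    CLSpaceToBatchLayer s2b{};
    s2b.configure(CLKernelLibrary::get().get_compile_context(), &src,
                  2 /* block_shape_x */, 2 /* block_shape_y */,
                  Size2D(0, 0) /* padding_left */, Size2D(0, 0) /* padding_right */, &dst);

    src.allocator()->allocate();
    dst.allocator()->allocate();
    s2b.run();
    CLScheduler::get().sync();
    return 0;
}
```
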
diff --git a/src/runtime/CL/functions/CLSpaceToDepthLayer.cpp b/src/runtime/CL/functions/CLSpaceToDepthLayer.cpp
index f02a13f..a4ffefc 100644
--- a/src/runtime/CL/functions/CLSpaceToDepthLayer.cpp
+++ b/src/runtime/CL/functions/CLSpaceToDepthLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -39,7 +39,12 @@
void CLSpaceToDepthLayer::configure(const ICLTensor *input, ICLTensor *output, int32_t block_shape)
{
- _space_to_depth_kernel.configure(input, output, block_shape);
+ configure(CLKernelLibrary::get().get_compile_context(), input, output, block_shape);
+}
+
+void CLSpaceToDepthLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape)
+{
+ _space_to_depth_kernel.configure(compile_context, input, output, block_shape);
}
Status CLSpaceToDepthLayer::validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape)
diff --git a/src/runtime/CL/functions/CLSplit.cpp b/src/runtime/CL/functions/CLSplit.cpp
index 9fca52b..cdc44d8 100644
--- a/src/runtime/CL/functions/CLSplit.cpp
+++ b/src/runtime/CL/functions/CLSplit.cpp
@@ -30,7 +30,6 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "support/ToolchainSupport.h"
namespace arm_compute
{
diff --git a/src/runtime/CL/functions/CLStackLayer.cpp b/src/runtime/CL/functions/CLStackLayer.cpp
index 04fe705..79c3fe5 100644
--- a/src/runtime/CL/functions/CLStackLayer.cpp
+++ b/src/runtime/CL/functions/CLStackLayer.cpp
@@ -33,8 +33,6 @@
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "support/ToolchainSupport.h"
-
namespace arm_compute
{
CLStackLayer::CLStackLayer() // NOLINT
@@ -46,6 +44,11 @@
void CLStackLayer::configure(const std::vector<ICLTensor *> &input, int axis, ICLTensor *output)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, axis, output);
+}
+
+void CLStackLayer::configure(const CLCompileContext &compile_context, const std::vector<ICLTensor *> &input, int axis, ICLTensor *output)
+{
_num_inputs = input.size();
_stack_kernels.resize(_num_inputs);
@@ -54,7 +57,7 @@
for(unsigned int i = 0; i < _num_inputs; i++)
{
- _stack_kernels[i].configure(input[i], axis_u, i, _num_inputs, output);
+ _stack_kernels[i].configure(compile_context, input[i], axis_u, i, _num_inputs, output);
}
}
@@ -87,4 +90,4 @@
CLScheduler::get().enqueue(_stack_kernels[i], false);
}
}
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
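
`CLStackLayer` configures one kernel per input tensor, each with the shared compile context. A sketch stacking two 3x2 tensors along a new leading axis; the output shape follows the library's stack-shape rule as I read it, so treat it as an assumption:

```cpp
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLStackLayer.h"

#include <vector>

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    CLTensor a{}, b{}, out{};
    a.allocator()->init(TensorInfo(TensorShape(3U, 2U), 1, DataType::F32));
    b.allocator()->init(TensorInfo(TensorShape(3U, 2U), 1, DataType::F32));
    // Stacking 2 tensors at axis 0 inserts a new leading dimension of size 2.
    out.allocator()->init(TensorInfo(TensorShape(2U, 3U, 2U), 1, DataType::F32));

    CLStackLayer stack{};
    stack.configure(CLKernelLibrary::get().get_compile_context(),
                    std::vector<ICLTensor *>{ &a, &b }, 0 /* axis */, &out);

    a.allocator()->allocate();
    b.allocator()->allocate();
    out.allocator()->allocate();
    stack.run();
    CLScheduler::get().sync();
    return 0;
}
```
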
diff --git a/src/runtime/CL/functions/CLStridedSlice.cpp b/src/runtime/CL/functions/CLStridedSlice.cpp
index e34f653..4547596 100644
--- a/src/runtime/CL/functions/CLStridedSlice.cpp
+++ b/src/runtime/CL/functions/CLStridedSlice.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,7 +25,7 @@
#include "arm_compute/core/CL/kernels/CLStridedSliceKernel.h"
#include "arm_compute/core/Types.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
namespace arm_compute
{
@@ -33,8 +33,15 @@
const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
+}
+
+void CLStridedSlice::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output,
+ const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
+ int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+{
auto k = arm_compute::support::cpp14::make_unique<CLStridedSliceKernel>();
- k->configure(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
+ k->configure(compile_context, input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
_kernel = std::move(k);
}
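
`CLStridedSlice` exposes the full stride/mask interface that `CLSlice` and `CLUnstack` build on. A sketch taking every second element along dimension 0, with illustrative shapes:

```cpp
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLStridedSlice.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    CLTensor src{}, dst{};
    src.allocator()->init(TensorInfo(TensorShape(8U, 8U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(4U, 8U), 1, DataType::F32)); // every 2nd column

    CLStridedSlice ss{};
    ss.configure(CLKernelLibrary::get().get_compile_context(), &src, &dst,
                 Coordinates(0, 0), Coordinates(8, 8), BiStrides(2, 1),
                 0 /* begin_mask */, 0 /* end_mask */, 0 /* shrink_axis_mask */);

    src.allocator()->allocate();
    dst.allocator()->allocate();
    ss.run();
    CLScheduler::get().sync();
    return 0;
}
```
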
diff --git a/src/runtime/CL/functions/CLTableLookup.cpp b/src/runtime/CL/functions/CLTableLookup.cpp
index d187650..47e15d3 100644
--- a/src/runtime/CL/functions/CLTableLookup.cpp
+++ b/src/runtime/CL/functions/CLTableLookup.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLTableLookup.h"
#include "arm_compute/core/CL/kernels/CLTableLookupKernel.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
#include <utility>
@@ -32,7 +32,12 @@
void CLTableLookup::configure(const ICLTensor *input, const ICLLut *lut, ICLTensor *output)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, lut, output);
+}
+
+void CLTableLookup::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLLut *lut, ICLTensor *output)
+{
auto k = arm_compute::support::cpp14::make_unique<CLTableLookupKernel>();
- k->configure(input, lut, output);
+ k->configure(compile_context, input, lut, output);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLThreshold.cpp b/src/runtime/CL/functions/CLThreshold.cpp
index 1b30b77..57c9272 100644
--- a/src/runtime/CL/functions/CLThreshold.cpp
+++ b/src/runtime/CL/functions/CLThreshold.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLThreshold.h"
#include "arm_compute/core/CL/kernels/CLThresholdKernel.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
#include <utility>
@@ -32,7 +32,13 @@
void CLThreshold::configure(const ICLTensor *input, ICLTensor *output, uint8_t threshold, uint8_t false_value, uint8_t true_value, ThresholdType type, uint8_t upper)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, output, threshold, false_value, true_value, type, upper);
+}
+
+void CLThreshold::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, uint8_t threshold, uint8_t false_value, uint8_t true_value, ThresholdType type,
+ uint8_t upper)
+{
auto k = arm_compute::support::cpp14::make_unique<CLThresholdKernel>();
- k->configure(input, output, threshold, false_value, true_value, type, upper);
+ k->configure(compile_context, input, output, threshold, false_value, true_value, type, upper);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLTile.cpp b/src/runtime/CL/functions/CLTile.cpp
index ec6a4ab..178d7af 100644
--- a/src/runtime/CL/functions/CLTile.cpp
+++ b/src/runtime/CL/functions/CLTile.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,14 +24,19 @@
#include "arm_compute/runtime/CL/functions/CLTile.h"
#include "arm_compute/core/CL/kernels/CLTileKernel.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
namespace arm_compute
{
void CLTile::configure(const ICLTensor *input, ICLTensor *output, const Multiples &multiples)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, output, multiples);
+}
+
+void CLTile::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Multiples &multiples)
+{
auto k = arm_compute::support::cpp14::make_unique<CLTileKernel>();
- k->configure(input, output, multiples);
+ k->configure(compile_context, input, output, multiples);
_kernel = std::move(k);
}
diff --git a/src/runtime/CL/functions/CLTranspose.cpp b/src/runtime/CL/functions/CLTranspose.cpp
index ecb59f7..f5121d0 100644
--- a/src/runtime/CL/functions/CLTranspose.cpp
+++ b/src/runtime/CL/functions/CLTranspose.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLTranspose.h"
#include "arm_compute/core/CL/kernels/CLTransposeKernel.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
#include <utility>
@@ -32,12 +32,17 @@
void CLTranspose::configure(const ICLTensor *input, ICLTensor *output)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, output);
+}
+
+void CLTranspose::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
+{
auto k = arm_compute::support::cpp14::make_unique<CLTransposeKernel>();
- k->configure(input, output);
+ k->configure(compile_context, input, output);
_kernel = std::move(k);
}
Status CLTranspose::validate(const ITensorInfo *input, const ITensorInfo *output)
{
return CLTransposeKernel::validate(input, output);
-}
\ No newline at end of file
+}
diff --git a/src/runtime/CL/functions/CLUnstack.cpp b/src/runtime/CL/functions/CLUnstack.cpp
index eb1dd8c..032fb99 100644
--- a/src/runtime/CL/functions/CLUnstack.cpp
+++ b/src/runtime/CL/functions/CLUnstack.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -61,6 +61,11 @@
void CLUnstack::configure(const ICLTensor *input, const std::vector<ICLTensor *> &output_vector, int axis)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, output_vector, axis);
+}
+
+void CLUnstack::configure(const CLCompileContext &compile_context, const ICLTensor *input, const std::vector<ICLTensor *> &output_vector, int axis)
+{
std::vector<ITensorInfo *> outputs_vector_info(output_vector.size());
std::transform(output_vector.begin(), output_vector.end(), outputs_vector_info.begin(), [](ICLTensor * t)
{
@@ -83,7 +88,7 @@
{
// Adjusts start and end coordinates to take a 2D slice at a time
slice_start.set(axis_u, slice);
- _strided_slice_vector[slice].configure(input, output_vector[slice], slice_start, Coordinates(), BiStrides(), 0, slice_end_mask, (1 << axis_u));
+ _strided_slice_vector[slice].configure(compile_context, input, output_vector[slice], slice_start, Coordinates(), BiStrides(), 0, slice_end_mask, (1 << axis_u));
}
}
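
`CLUnstack` instantiates one `CLStridedSlice` per output slice, each now configured with the compile context plus a shrink-axis mask for the unstacked dimension. A caller-side sketch with assumed shapes:

```cpp
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLUnstack.h"

#include <vector>

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    CLTensor src{};
    src.allocator()->init(TensorInfo(TensorShape(3U, 2U, 4U), 1, DataType::F32));

    // Unstacking along dimension 2 (extent 4) yields four 3x2 slices.
    std::vector<CLTensor>    outs(4);
    std::vector<ICLTensor *> out_ptrs{};
    for(auto &t : outs)
    {
        t.allocator()->init(TensorInfo(TensorShape(3U, 2U), 1, DataType::F32));
        out_ptrs.push_back(&t);
    }

    CLUnstack unstack{};
    unstack.configure(CLKernelLibrary::get().get_compile_context(), &src, out_ptrs, 2 /* axis */);

    src.allocator()->allocate();
    for(auto &t : outs)
    {
        t.allocator()->allocate();
    }
    unstack.run();
    CLScheduler::get().sync();
    return 0;
}
```
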
diff --git a/src/runtime/CL/functions/CLUpsampleLayer.cpp b/src/runtime/CL/functions/CLUpsampleLayer.cpp
index 1dad325..dd04686 100644
--- a/src/runtime/CL/functions/CLUpsampleLayer.cpp
+++ b/src/runtime/CL/functions/CLUpsampleLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -44,10 +44,16 @@
void CLUpsampleLayer::configure(ICLTensor *input, ICLTensor *output,
const Size2D &info, const InterpolationPolicy upsampling_policy)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, output, info, upsampling_policy);
+}
+
+void CLUpsampleLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output,
+ const Size2D &info, const InterpolationPolicy upsampling_policy)
+{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
_output = output;
- _upsample.configure(input, _output, info, upsampling_policy);
+ _upsample.configure(compile_context, input, _output, info, upsampling_policy);
}
void CLUpsampleLayer::run()
diff --git a/src/runtime/CL/functions/CLWarpAffine.cpp b/src/runtime/CL/functions/CLWarpAffine.cpp
index 4286cf6..ce2171b 100644
--- a/src/runtime/CL/functions/CLWarpAffine.cpp
+++ b/src/runtime/CL/functions/CLWarpAffine.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,7 +25,7 @@
#include "arm_compute/core/CL/kernels/CLWarpAffineKernel.h"
#include "arm_compute/core/PixelValue.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
#include <utility>
@@ -33,8 +33,14 @@
void CLWarpAffine::configure(ICLTensor *input, ICLTensor *output, const std::array<float, 9> &matrix, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, output, matrix, policy, border_mode, constant_border_value);
+}
+
+void CLWarpAffine::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const std::array<float, 9> &matrix, InterpolationPolicy policy, BorderMode border_mode,
+ uint8_t constant_border_value)
+{
auto k = arm_compute::support::cpp14::make_unique<CLWarpAffineKernel>();
- k->configure(input, output, matrix, policy);
+ k->configure(compile_context, input, output, matrix, policy);
_kernel = std::move(k);
- _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+ _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
}
diff --git a/src/runtime/CL/functions/CLWarpPerspective.cpp b/src/runtime/CL/functions/CLWarpPerspective.cpp
index 4603ee0..06c0661 100644
--- a/src/runtime/CL/functions/CLWarpPerspective.cpp
+++ b/src/runtime/CL/functions/CLWarpPerspective.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,7 +25,7 @@
#include "arm_compute/core/CL/kernels/CLWarpPerspectiveKernel.h"
#include "arm_compute/core/PixelValue.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
#include <utility>
@@ -33,8 +33,14 @@
void CLWarpPerspective::configure(ICLTensor *input, ICLTensor *output, const std::array<float, 9> &matrix, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, output, matrix, policy, border_mode, constant_border_value);
+}
+
+void CLWarpPerspective::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const std::array<float, 9> &matrix, InterpolationPolicy policy, BorderMode border_mode,
+ uint8_t constant_border_value)
+{
auto k = arm_compute::support::cpp14::make_unique<CLWarpPerspectiveKernel>();
- k->configure(input, output, matrix, policy);
+ k->configure(compile_context, input, output, matrix, policy);
_kernel = std::move(k);
- _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+ _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
}
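
Both warp functions pass the 3x3 matrix through unchanged and configure their border handler with the same context. A sketch using an identity homography; shapes and policies are placeholders, and these functions operate on U8 images:

```cpp
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLWarpPerspective.h"

#include <array>

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    // Identity homography: output equals input; replace with a real 3x3 matrix as needed.
    const std::array<float, 9> matrix = { 1.f, 0.f, 0.f,
                                          0.f, 1.f, 0.f,
                                          0.f, 0.f, 1.f };

    CLTensor src{}, dst{};
    src.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::U8));
    dst.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::U8));

    CLWarpPerspective warp{};
    warp.configure(CLKernelLibrary::get().get_compile_context(), &src, &dst, matrix,
                   InterpolationPolicy::NEAREST_NEIGHBOR, BorderMode::CONSTANT,
                   0 /* constant_border_value */);

    src.allocator()->allocate();
    dst.allocator()->allocate();
    warp.run();
    CLScheduler::get().sync();
    return 0;
}
```
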
diff --git a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
index a5db977..132c3ee 100644
--- a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -98,6 +98,13 @@
void CLWinogradConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info,
bool enable_fast_math)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, act_info, enable_fast_math);
+}
+
+void CLWinogradConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info, bool enable_fast_math)
+{
// Get indices for the width and height
const size_t idx_width = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
const size_t idx_height = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
@@ -129,17 +136,18 @@
// Do not manage _input1 as it contains the weights
// Configure input transform
- _input_transform.configure(input, &_input0, winograd_info);
+ _input_transform.configure(compile_context, input, &_input0, winograd_info);
// Configure filter transform
- _filter_transform.configure(weights, &_input1, winograd_info);
+ _filter_transform.configure(compile_context, weights, &_input1, winograd_info);
// Configure batched matrix multiply
- _batched_mm.configure(&_input0, &_input1, nullptr, &_batched_mm_output, 1.0f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run*/, 0, false, false, GEMMLowpOutputStageInfo(),
- (input->info()->data_type() == DataType::F16)));
+ _batched_mm.configure(compile_context, &_input0, &_input1, nullptr, &_batched_mm_output, 1.0f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run*/, 0, false, false,
+ GEMMLowpOutputStageInfo(),
+ (input->info()->data_type() == DataType::F16)));
// Configure output transform
- _output_transform.configure(&_batched_mm_output, biases, output, winograd_info, act_info);
+ _output_transform.configure(compile_context, &_batched_mm_output, biases, output, winograd_info, act_info);
// Allocate temporary tensors
_input0.allocator()->allocate();
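
`CLWinogradConvolutionLayer` threads one compile context through its whole pipeline: input transform, filter transform, batched matrix multiply and output transform. A sketch of a 3x3, stride-1 F32 convolution, which should be within Winograd's constraints; shapes are illustrative assumptions:

```cpp
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    // NCHW-style shapes (W, H, C[, N]): 8x8 input, 2 input / 4 output channels, 3x3 kernel.
    CLTensor src{}, weights{}, biases{}, dst{};
    src.allocator()->init(TensorInfo(TensorShape(8U, 8U, 2U), 1, DataType::F32));
    weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 2U, 4U), 1, DataType::F32));
    biases.allocator()->init(TensorInfo(TensorShape(4U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(8U, 8U, 4U), 1, DataType::F32));

    CLWinogradConvolutionLayer winograd{};
    winograd.configure(CLKernelLibrary::get().get_compile_context(), &src, &weights, &biases, &dst,
                       PadStrideInfo(1, 1, 1, 1) /* stride 1, pad 1 -> "same" output */,
                       ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
                       false /* enable_fast_math */);

    src.allocator()->allocate();
    weights.allocator()->allocate();
    biases.allocator()->allocate();
    dst.allocator()->allocate();
    winograd.run();
    CLScheduler::get().sync();
    return 0;
}
```
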
diff --git a/src/runtime/CL/functions/CLWinogradInputTransform.cpp b/src/runtime/CL/functions/CLWinogradInputTransform.cpp
index 7361eb2..ae40076 100644
--- a/src/runtime/CL/functions/CLWinogradInputTransform.cpp
+++ b/src/runtime/CL/functions/CLWinogradInputTransform.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,16 +26,21 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/CL/kernels/CLWinogradInputTransformKernel.h"
#include "arm_compute/core/Error.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
using namespace arm_compute;
void CLWinogradInputTransform::configure(ICLTensor *input, ICLTensor *output, const WinogradInfo &winograd_info)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, output, winograd_info);
+}
+
+void CLWinogradInputTransform::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const WinogradInfo &winograd_info)
+{
auto k = arm_compute::support::cpp14::make_unique<CLWinogradInputTransformKernel>();
- k->configure(input, output, winograd_info);
+ k->configure(compile_context, input, output, winograd_info);
_kernel = std::move(k);
- _border_handler.configure(input, _kernel->border_size(), BorderMode::CONSTANT, PixelValue());
+ _border_handler.configure(compile_context, input, _kernel->border_size(), BorderMode::CONSTANT, PixelValue());
}
Status CLWinogradInputTransform::validate(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info)
diff --git a/src/runtime/CL/functions/CLYOLOLayer.cpp b/src/runtime/CL/functions/CLYOLOLayer.cpp
index 5a612ba..0c0c106 100644
--- a/src/runtime/CL/functions/CLYOLOLayer.cpp
+++ b/src/runtime/CL/functions/CLYOLOLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,14 +25,19 @@
#include "arm_compute/core/CL/kernels/CLYOLOLayerKernel.h"
#include "arm_compute/core/Types.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
using namespace arm_compute;
void CLYOLOLayer::configure(ICLTensor *input, ICLTensor *output, const ActivationLayerInfo &act_info, int32_t num_classes)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, output, act_info, num_classes);
+}
+
+void CLYOLOLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const ActivationLayerInfo &act_info, int32_t num_classes)
+{
auto k = arm_compute::support::cpp14::make_unique<CLYOLOLayerKernel>();
- k->configure(input, output, act_info, num_classes);
+ k->configure(compile_context, input, output, act_info, num_classes);
_kernel = std::move(k);
}
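
`CLYOLOLayer` closes out the release's sweep of the CL functions. A final sketch; the channel layout (anchors times 5 + num_classes) is an assumption about how YOLO detection heads are arranged, not something stated in the diff:

```cpp
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLYOLOLayer.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    // 13x13 grid; channels = anchors * (5 + num_classes), e.g. 3 * (5 + 80) = 255.
    CLTensor src{}, dst{};
    src.allocator()->init(TensorInfo(TensorShape(13U, 13U, 255U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(13U, 13U, 255U), 1, DataType::F32));

    CLYOLOLayer yolo{};
    yolo.configure(CLKernelLibrary::get().get_compile_context(), &src, &dst,
                   ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC),
                   80 /* num_classes */);

    src.allocator()->allocate();
    dst.allocator()->allocate();
    yolo.run();
    CLScheduler::get().sync();
    return 0;
}
```
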