arm_compute v20.05
diff --git a/src/runtime/CPP/CPPScheduler.cpp b/src/runtime/CPP/CPPScheduler.cpp
index e684eee..0a03497 100644
--- a/src/runtime/CPP/CPPScheduler.cpp
+++ b/src/runtime/CPP/CPPScheduler.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -71,6 +71,61 @@
const unsigned int _end;
};
+/** Given two dimensions and a maximum number of threads to utilise, calculate the best
+ * combination of threads that fit in (multiplied together) max_threads.
+ *
+ * This algorithm assumes that work in either of the dimensions is equally difficult
+ * to compute
+ *
+ * @returns [m_nthreads, n_nthreads] A pair of the threads that should be used in each dimension
+ */
+std::pair<unsigned, unsigned> split_2d(unsigned max_threads, std::size_t m, std::size_t n)
+{
+ /*
+ * We want the same ratio of threads in M & N to the ratio of m and n problem size
+ *
+ * Therefore: mt/nt == m/n where mt*nt == max_threads
+ *
+ * max_threads/nt = mt & (max_threads/nt) * (m/n) = nt
+ * nt^2 = max_threads * (m/n)
+ * nt = sqrt( max_threads * (m/n) )
+ */
+ //ratio of m to n in problem dimensions
+ double ratio = m / static_cast<double>(n);
+
+ // nt = sqrt(max_threads * (m / n) )
+ const unsigned adjusted = std::round(
+ std::sqrt(max_threads * ratio));
+
+ //find the nearest factor of max_threads
+ for(unsigned i = 0; i!= adjusted; ++i)
+ {
+ //try down
+ const unsigned adj_down = adjusted - i;
+ if(max_threads % adj_down == 0)
+ {
+ return { adj_down, max_threads / adj_down };
+ }
+
+ //try up
+ const unsigned adj_up = adjusted + i;
+ if(max_threads % adj_up == 0)
+ {
+ return { adj_up, max_threads / adj_up };
+ }
+ }
+
+ //we didn't find anything so let's bail out with maxes biased to the largest dimension
+ if(m > n)
+ {
+ return{ std::min<unsigned>(m, max_threads), 1 };
+ }
+ else
+ {
+ return{ 1, std::min<unsigned>(n, max_threads) };
+ }
+}
+
/** Execute workloads[info.thread_id] first, then call the feeder to get the index of the next workload to run.
*
* Will run workloads until the feeder reaches the end of its range.
@@ -314,50 +369,95 @@
ARM_COMPUTE_ERROR_ON_MSG(!kernel, "The child class didn't set the kernel");
const Window &max_window = kernel->window();
- const unsigned int num_iterations = max_window.num_iterations(hints.split_dimension());
- const unsigned int num_threads = std::min(num_iterations, _impl->_num_threads);
- if(num_iterations == 0)
+ if(hints.split_dimension() == IScheduler::split_dimensions_all)
{
- return;
- }
+ /*
+ * if the split dim is size_t max then this signals we should parallelise over
+ * all dimensions
+ */
+ const std::size_t m = max_window.num_iterations(Window::DimX);
+ const std::size_t n = max_window.num_iterations(Window::DimY);
- if(!kernel->is_parallelisable() || num_threads == 1)
- {
- ThreadInfo info;
- info.cpu_info = &_cpu_info;
- kernel->run(max_window, info);
+ //in c++17 this can be swapped for auto [ m_threads, n_threads ] = split_2d(...
+ unsigned m_threads, n_threads;
+ std::tie(m_threads, n_threads) = split_2d(_impl->_num_threads, m, n);
+
+ std::vector<IScheduler::Workload> workloads;
+ for(unsigned int ni = 0; ni != n_threads; ++ni)
+ {
+ for(unsigned int mi = 0; mi != m_threads; ++mi)
+ {
+ workloads.push_back(
+ [ ni, mi, m_threads, n_threads, &max_window, &kernel ]
+ (const ThreadInfo & info)
+ {
+ //narrow the window to our mi-ni workload
+ Window win = max_window.split_window(Window::DimX, mi, m_threads)
+ .split_window(Window::DimY, ni, n_threads);
+
+ win.validate();
+
+ Window thread_locator;
+ thread_locator.set(Window::DimX, Window::Dimension(mi, m_threads));
+ thread_locator.set(Window::DimY, Window::Dimension(ni, n_threads));
+
+ thread_locator.validate();
+
+ kernel->run_nd(win, info, thread_locator);
+ }
+ );
+ }
+ }
+ run_workloads(workloads);
}
else
{
- unsigned int num_windows = 0;
- switch(hints.strategy())
+ const unsigned int num_iterations = max_window.num_iterations(hints.split_dimension());
+ const unsigned int num_threads = std::min(num_iterations, _impl->_num_threads);
+
+ if(num_iterations == 0)
{
- case StrategyHint::STATIC:
- num_windows = num_threads;
- break;
- case StrategyHint::DYNAMIC:
+ return;
+ }
+
+ if(!kernel->is_parallelisable() || num_threads == 1)
+ {
+ ThreadInfo info;
+ info.cpu_info = &_cpu_info;
+ kernel->run(max_window, info);
+ }
+ else
+ {
+ unsigned int num_windows = 0;
+ switch(hints.strategy())
{
- const unsigned int granule_threshold = (hints.threshold() <= 0) ? num_threads : static_cast<unsigned int>(hints.threshold());
- // Make sure we don't use some windows which are too small as this might create some contention on the ThreadFeeder
- num_windows = num_iterations > granule_threshold ? granule_threshold : num_iterations;
- break;
+ case StrategyHint::STATIC:
+ num_windows = num_threads;
+ break;
+ case StrategyHint::DYNAMIC:
+ {
+ const unsigned int granule_threshold = (hints.threshold() <= 0) ? num_threads : static_cast<unsigned int>(hints.threshold());
+ // Make sure we don't use some windows which are too small as this might create some contention on the ThreadFeeder
+ num_windows = num_iterations > granule_threshold ? granule_threshold : num_iterations;
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Unknown strategy");
}
- default:
- ARM_COMPUTE_ERROR("Unknown strategy");
- }
- std::vector<IScheduler::Workload> workloads(num_windows);
- for(unsigned int t = 0; t < num_windows; t++)
- {
- //Capture 't' by copy, all the other variables by reference:
- workloads[t] = [t, &hints, &max_window, &num_windows, &kernel](const ThreadInfo & info)
+ std::vector<IScheduler::Workload> workloads(num_windows);
+ for(unsigned int t = 0; t < num_windows; t++)
{
- Window win = max_window.split_window(hints.split_dimension(), t, num_windows);
- win.validate();
- kernel->run(win, info);
- };
+ //Capture 't' by copy, all the other variables by reference:
+ workloads[t] = [t, &hints, &max_window, &num_windows, &kernel](const ThreadInfo & info)
+ {
+ Window win = max_window.split_window(hints.split_dimension(), t, num_windows);
+ win.validate();
+ kernel->run(win, info);
+ };
+ }
+ run_workloads(workloads);
}
- run_workloads(workloads);
}
}
} // namespace arm_compute
diff --git a/src/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.cpp b/src/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.cpp
index d4be939..232f71d 100644
--- a/src/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.cpp
+++ b/src/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -49,6 +49,13 @@
},
input_it, output_it);
break;
+ case DataType::QASYMM8_SIGNED:
+ execute_window_loop(window, [&](const Coordinates &)
+ {
+ *reinterpret_cast<float *>(output_it.ptr()) = dequantize_qasymm8_signed(*reinterpret_cast<const int8_t *>(input_it.ptr()), qinfo);
+ },
+ input_it, output_it);
+ break;
case DataType::QASYMM16:
execute_window_loop(window, [&](const Coordinates &)
{
@@ -80,6 +87,13 @@
},
input_it, output_it);
break;
+ case DataType::QASYMM8_SIGNED:
+ execute_window_loop(window, [&](const Coordinates &)
+ {
+ *reinterpret_cast<int8_t *>(output_it.ptr()) = quantize_qasymm8_signed(*reinterpret_cast<const float *>(input_it.ptr()), qinfo);
+ },
+ input_it, output_it);
+ break;
case DataType::QASYMM16:
execute_window_loop(window, [&](const Coordinates &)
{
@@ -121,7 +135,7 @@
{
ARM_COMPUTE_ERROR_ON_NULLPTR(scores_in, boxes_in, scores_out, boxes_out, classes);
- _is_qasymm8 = scores_in->info()->data_type() == DataType::QASYMM8;
+ _is_qasymm8 = scores_in->info()->data_type() == DataType::QASYMM8 || scores_in->info()->data_type() == DataType::QASYMM8_SIGNED;
_scores_in = scores_in;
_boxes_in = boxes_in;
@@ -198,9 +212,9 @@
{
ARM_COMPUTE_UNUSED(batch_splits_in, batch_splits_out, keeps, keeps_size, info);
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(scores_in, boxes_in, scores_out, boxes_out, classes);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scores_in, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scores_in, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
- const bool is_qasymm8 = scores_in->data_type() == DataType::QASYMM8;
+ const bool is_qasymm8 = scores_in->data_type() == DataType::QASYMM8 || scores_in->data_type() == DataType::QASYMM8_SIGNED;
if(is_qasymm8)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(boxes_in, 1, DataType::QASYMM16);
diff --git a/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp b/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp
index e0acf06..4ec0ab6 100644
--- a/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp
+++ b/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,7 +26,6 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Validate.h"
-#include "support/ToolchainSupport.h"
#include <list>
diff --git a/src/runtime/CPP/functions/CPPDetectionPostProcessLayer.cpp b/src/runtime/CPP/functions/CPPDetectionPostProcessLayer.cpp
index bc88f71..b3fc9c7 100644
--- a/src/runtime/CPP/functions/CPPDetectionPostProcessLayer.cpp
+++ b/src/runtime/CPP/functions/CPPDetectionPostProcessLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,7 +26,6 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Validate.h"
-#include "support/ToolchainSupport.h"
#include <cstddef>
#include <ios>
@@ -41,7 +40,7 @@
DetectionPostProcessLayerInfo info, const unsigned int kBatchSize, const unsigned int kNumCoordBox)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input_box_encoding, input_class_score, input_anchors);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_box_encoding, 1, DataType::F32, DataType::QASYMM8);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_box_encoding, 1, DataType::F32, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_box_encoding, input_anchors);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_box_encoding->num_dimensions() > 3, "The location input tensor shape should be [4, N, kBatchSize].");
if(input_box_encoding->num_dimensions() > 2)
@@ -91,6 +90,24 @@
return Status{};
}
+inline void DecodeBoxCorner(BBox &box_centersize, BBox &anchor, Iterator &decoded_it, DetectionPostProcessLayerInfo info)
+{
+ const float half_factor = 0.5f;
+
+ // BBox is equivalent to CenterSizeEncoding [y,x,h,w]
+ const float y_center = box_centersize[0] / info.scale_value_y() * anchor[2] + anchor[0];
+ const float x_center = box_centersize[1] / info.scale_value_x() * anchor[3] + anchor[1];
+ const float half_h = half_factor * static_cast<float>(std::exp(box_centersize[2] / info.scale_value_h())) * anchor[2];
+ const float half_w = half_factor * static_cast<float>(std::exp(box_centersize[3] / info.scale_value_w())) * anchor[3];
+
+ // Box Corner encoding boxes are saved as [xmin, ymin, xmax, ymax]
+ auto decoded_ptr = reinterpret_cast<float *>(decoded_it.ptr());
+ *(decoded_ptr) = x_center - half_w; // xmin
+ *(1 + decoded_ptr) = y_center - half_h; // ymin
+ *(2 + decoded_ptr) = x_center + half_w; // xmax
+ *(3 + decoded_ptr) = y_center + half_h; // ymax
+}
+
/** Decode a bbox according to a anchors and scale info.
*
* @param[in] input_box_encoding The input prior bounding boxes.
@@ -102,8 +119,8 @@
{
const QuantizationInfo &qi_box = input_box_encoding->info()->quantization_info();
const QuantizationInfo &qi_anchors = input_anchors->info()->quantization_info();
- BBox box_centersize;
- BBox anchor;
+ BBox box_centersize{ {} };
+ BBox anchor{ {} };
Window win;
win.use_tensor_dimensions(input_box_encoding->info()->tensor_shape());
@@ -113,11 +130,9 @@
Iterator anchor_it(input_anchors, win);
Iterator decoded_it(decoded_boxes, win);
- const float half_factor = 0.5f;
-
- execute_window_loop(win, [&](const Coordinates &)
+ if(input_box_encoding->info()->data_type() == DataType::QASYMM8)
{
- if(is_data_type_quantized(input_box_encoding->info()->data_type()))
+ execute_window_loop(win, [&](const Coordinates &)
{
const auto box_ptr = reinterpret_cast<const qasymm8_t *>(box_it.ptr());
const auto anchor_ptr = reinterpret_cast<const qasymm8_t *>(anchor_it.ptr());
@@ -127,29 +142,38 @@
anchor = BBox({ dequantize_qasymm8(*anchor_ptr, qi_anchors), dequantize_qasymm8(*(anchor_ptr + 1), qi_anchors),
dequantize_qasymm8(*(2 + anchor_ptr), qi_anchors), dequantize_qasymm8(*(3 + anchor_ptr), qi_anchors)
});
- }
- else
+ DecodeBoxCorner(box_centersize, anchor, decoded_it, info);
+ },
+ box_it, anchor_it, decoded_it);
+ }
+ else if(input_box_encoding->info()->data_type() == DataType::QASYMM8_SIGNED)
+ {
+ execute_window_loop(win, [&](const Coordinates &)
+ {
+ const auto box_ptr = reinterpret_cast<const qasymm8_signed_t *>(box_it.ptr());
+ const auto anchor_ptr = reinterpret_cast<const qasymm8_signed_t *>(anchor_it.ptr());
+ box_centersize = BBox({ dequantize_qasymm8_signed(*box_ptr, qi_box), dequantize_qasymm8_signed(*(box_ptr + 1), qi_box),
+ dequantize_qasymm8_signed(*(2 + box_ptr), qi_box), dequantize_qasymm8_signed(*(3 + box_ptr), qi_box)
+ });
+ anchor = BBox({ dequantize_qasymm8_signed(*anchor_ptr, qi_anchors), dequantize_qasymm8_signed(*(anchor_ptr + 1), qi_anchors),
+ dequantize_qasymm8_signed(*(2 + anchor_ptr), qi_anchors), dequantize_qasymm8_signed(*(3 + anchor_ptr), qi_anchors)
+ });
+ DecodeBoxCorner(box_centersize, anchor, decoded_it, info);
+ },
+ box_it, anchor_it, decoded_it);
+ }
+ else
+ {
+ execute_window_loop(win, [&](const Coordinates &)
{
const auto box_ptr = reinterpret_cast<const float *>(box_it.ptr());
const auto anchor_ptr = reinterpret_cast<const float *>(anchor_it.ptr());
box_centersize = BBox({ *box_ptr, *(box_ptr + 1), *(2 + box_ptr), *(3 + box_ptr) });
anchor = BBox({ *anchor_ptr, *(anchor_ptr + 1), *(2 + anchor_ptr), *(3 + anchor_ptr) });
- }
-
- // BBox is equavalent to CenterSizeEncoding [y,x,h,w]
- const float y_center = box_centersize[0] / info.scale_value_y() * anchor[2] + anchor[0];
- const float x_center = box_centersize[1] / info.scale_value_x() * anchor[3] + anchor[1];
- const float half_h = half_factor * static_cast<float>(std::exp(box_centersize[2] / info.scale_value_h())) * anchor[2];
- const float half_w = half_factor * static_cast<float>(std::exp(box_centersize[3] / info.scale_value_w())) * anchor[3];
-
- // Box Corner encoding boxes are saved as [xmin, ymin, xmax, ymax]
- auto decoded_ptr = reinterpret_cast<float *>(decoded_it.ptr());
- *(decoded_ptr) = x_center - half_w; // xmin
- *(1 + decoded_ptr) = y_center - half_h; // ymin
- *(2 + decoded_ptr) = x_center + half_w; // xmax
- *(3 + decoded_ptr) = y_center + half_h; // ymax
- },
- box_it, anchor_it, decoded_it);
+ DecodeBoxCorner(box_centersize, anchor, decoded_it, info);
+ },
+ box_it, anchor_it, decoded_it);
+ }
}
void SaveOutputs(const Tensor *decoded_boxes, const std::vector<int> &result_idx_boxes_after_nms, const std::vector<float> &result_scores_after_nms, const std::vector<int> &result_classes_after_nms,
@@ -264,12 +288,26 @@
// Decode scores if necessary
if(_dequantize_scores)
{
- for(unsigned int idx_c = 0; idx_c < _num_classes_with_background; ++idx_c)
+ if(_input_box_encoding->info()->data_type() == DataType::QASYMM8)
{
- for(unsigned int idx_b = 0; idx_b < _num_boxes; ++idx_b)
+ for(unsigned int idx_c = 0; idx_c < _num_classes_with_background; ++idx_c)
{
- *(reinterpret_cast<float *>(_decoded_scores.ptr_to_element(Coordinates(idx_c, idx_b)))) =
- dequantize_qasymm8(*(reinterpret_cast<qasymm8_t *>(_input_scores->ptr_to_element(Coordinates(idx_c, idx_b)))), _input_scores->info()->quantization_info());
+ for(unsigned int idx_b = 0; idx_b < _num_boxes; ++idx_b)
+ {
+ *(reinterpret_cast<float *>(_decoded_scores.ptr_to_element(Coordinates(idx_c, idx_b)))) =
+ dequantize_qasymm8(*(reinterpret_cast<qasymm8_t *>(_input_scores->ptr_to_element(Coordinates(idx_c, idx_b)))), _input_scores->info()->quantization_info());
+ }
+ }
+ }
+ else if(_input_box_encoding->info()->data_type() == DataType::QASYMM8_SIGNED)
+ {
+ for(unsigned int idx_c = 0; idx_c < _num_classes_with_background; ++idx_c)
+ {
+ for(unsigned int idx_b = 0; idx_b < _num_boxes; ++idx_b)
+ {
+ *(reinterpret_cast<float *>(_decoded_scores.ptr_to_element(Coordinates(idx_c, idx_b)))) =
+ dequantize_qasymm8_signed(*(reinterpret_cast<qasymm8_signed_t *>(_input_scores->ptr_to_element(Coordinates(idx_c, idx_b)))), _input_scores->info()->quantization_info());
+ }
}
}
}
diff --git a/src/runtime/CPP/functions/CPPNonMaximumSuppression.cpp b/src/runtime/CPP/functions/CPPNonMaximumSuppression.cpp
index f13674a..8856191 100644
--- a/src/runtime/CPP/functions/CPPNonMaximumSuppression.cpp
+++ b/src/runtime/CPP/functions/CPPNonMaximumSuppression.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CPP/functions/CPPNonMaximumSuppression.h"
#include "arm_compute/core/CPP/kernels/CPPNonMaximumSuppressionKernel.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
namespace arm_compute
{
diff --git a/src/runtime/CPP/functions/CPPPermute.cpp b/src/runtime/CPP/functions/CPPPermute.cpp
index bafcd2f..1cdfe92 100644
--- a/src/runtime/CPP/functions/CPPPermute.cpp
+++ b/src/runtime/CPP/functions/CPPPermute.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CPP/functions/CPPPermute.h"
#include "arm_compute/core/CPP/kernels/CPPPermuteKernel.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
using namespace arm_compute;
diff --git a/src/runtime/CPP/functions/CPPTopKV.cpp b/src/runtime/CPP/functions/CPPTopKV.cpp
index c4e1eab..eb0d560 100644
--- a/src/runtime/CPP/functions/CPPTopKV.cpp
+++ b/src/runtime/CPP/functions/CPPTopKV.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CPP/functions/CPPTopKV.h"
#include "arm_compute/core/CPP/kernels/CPPTopKVKernel.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
namespace arm_compute
{
diff --git a/src/runtime/CPP/functions/CPPUpsample.cpp b/src/runtime/CPP/functions/CPPUpsample.cpp
index 0fd7d93..a154b5e 100644
--- a/src/runtime/CPP/functions/CPPUpsample.cpp
+++ b/src/runtime/CPP/functions/CPPUpsample.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,7 +24,7 @@
#include "arm_compute/runtime/CPP/functions/CPPUpsample.h"
#include "arm_compute/core/CPP/kernels/CPPUpsampleKernel.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
using namespace arm_compute;