arm_compute v19.11
diff --git a/src/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.cpp b/src/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.cpp
index 2e10152..d4be939 100644
--- a/src/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.cpp
+++ b/src/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,14 +24,226 @@
 #include "arm_compute/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.h"
 
 #include "arm_compute/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.h"
-#include "support/ToolchainSupport.h"
+#include "arm_compute/runtime/Scheduler.h"
 
-using namespace arm_compute;
+namespace arm_compute
+{
+namespace
+{
+void dequantize_tensor(const ITensor *input, ITensor *output)
+{
+    const UniformQuantizationInfo qinfo     = input->info()->quantization_info().uniform();
+    const DataType                data_type = input->info()->data_type();
+
+    Window window;
+    window.use_tensor_dimensions(input->info()->tensor_shape());
+    Iterator input_it(input, window);
+    Iterator output_it(output, window);
+
+    switch(data_type)
+    {
+        case DataType::QASYMM8:
+            execute_window_loop(window, [&](const Coordinates &)
+            {
+                *reinterpret_cast<float *>(output_it.ptr()) = dequantize(*reinterpret_cast<const uint8_t *>(input_it.ptr()), qinfo.scale, qinfo.offset);
+            },
+            input_it, output_it);
+            break;
+        case DataType::QASYMM16:
+            execute_window_loop(window, [&](const Coordinates &)
+            {
+                *reinterpret_cast<float *>(output_it.ptr()) = dequantize(*reinterpret_cast<const uint16_t *>(input_it.ptr()), qinfo.scale, qinfo.offset);
+            },
+            input_it, output_it);
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Unsupported data type");
+    }
+}
+
+void quantize_tensor(const ITensor *input, ITensor *output)
+{
+    const UniformQuantizationInfo qinfo     = output->info()->quantization_info().uniform();
+    const DataType                data_type = output->info()->data_type();
+
+    Window window;
+    window.use_tensor_dimensions(input->info()->tensor_shape());
+    Iterator input_it(input, window);
+    Iterator output_it(output, window);
+
+    switch(data_type)
+    {
+        case DataType::QASYMM8:
+            execute_window_loop(window, [&](const Coordinates &)
+            {
+                *reinterpret_cast<uint8_t *>(output_it.ptr()) = quantize_qasymm8(*reinterpret_cast<const float *>(input_it.ptr()), qinfo);
+            },
+            input_it, output_it);
+            break;
+        case DataType::QASYMM16:
+            execute_window_loop(window, [&](const Coordinates &)
+            {
+                *reinterpret_cast<uint16_t *>(output_it.ptr()) = quantize_qasymm16(*reinterpret_cast<const float *>(input_it.ptr()), qinfo);
+            },
+            input_it, output_it);
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Unsupported data type");
+    }
+}
+} // namespace
+
+CPPBoxWithNonMaximaSuppressionLimit::CPPBoxWithNonMaximaSuppressionLimit(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)),
+      _box_with_nms_limit_kernel(),
+      _scores_in(),
+      _boxes_in(),
+      _batch_splits_in(),
+      _scores_out(),
+      _boxes_out(),
+      _classes(),
+      _batch_splits_out(),
+      _keeps(),
+      _scores_in_f32(),
+      _boxes_in_f32(),
+      _batch_splits_in_f32(),
+      _scores_out_f32(),
+      _boxes_out_f32(),
+      _classes_f32(),
+      _batch_splits_out_f32(),
+      _keeps_f32(),
+      _is_qasymm8(false)
+{
+}
 
 void CPPBoxWithNonMaximaSuppressionLimit::configure(const ITensor *scores_in, const ITensor *boxes_in, const ITensor *batch_splits_in, ITensor *scores_out, ITensor *boxes_out, ITensor *classes,
                                                     ITensor *batch_splits_out, ITensor *keeps, ITensor *keeps_size, const BoxNMSLimitInfo info)
 {
-    auto k = arm_compute::support::cpp14::make_unique<CPPBoxWithNonMaximaSuppressionLimitKernel>();
-    k->configure(scores_in, boxes_in, batch_splits_in, scores_out, boxes_out, classes, batch_splits_out, keeps, keeps_size, info);
-    _kernel = std::move(k);
-}
\ No newline at end of file
+    ARM_COMPUTE_ERROR_ON_NULLPTR(scores_in, boxes_in, scores_out, boxes_out, classes);
+
+    _is_qasymm8 = scores_in->info()->data_type() == DataType::QASYMM8;
+
+    _scores_in        = scores_in;
+    _boxes_in         = boxes_in;
+    _batch_splits_in  = batch_splits_in;
+    _scores_out       = scores_out;
+    _boxes_out        = boxes_out;
+    _classes          = classes;
+    _batch_splits_out = batch_splits_out;
+    _keeps            = keeps;
+
+    if(_is_qasymm8)
+    {
+        // Manage intermediate buffers
+        _memory_group.manage(&_scores_in_f32);
+        _memory_group.manage(&_boxes_in_f32);
+        _memory_group.manage(&_scores_out_f32);
+        _memory_group.manage(&_boxes_out_f32);
+        _memory_group.manage(&_classes_f32);
+        _scores_in_f32.allocator()->init(scores_in->info()->clone()->set_data_type(DataType::F32));
+        _boxes_in_f32.allocator()->init(boxes_in->info()->clone()->set_data_type(DataType::F32));
+        if(batch_splits_in != nullptr)
+        {
+            _memory_group.manage(&_batch_splits_in_f32);
+            _batch_splits_in_f32.allocator()->init(batch_splits_in->info()->clone()->set_data_type(DataType::F32));
+        }
+        _scores_out_f32.allocator()->init(scores_out->info()->clone()->set_data_type(DataType::F32));
+        _boxes_out_f32.allocator()->init(boxes_out->info()->clone()->set_data_type(DataType::F32));
+        _classes_f32.allocator()->init(classes->info()->clone()->set_data_type(DataType::F32));
+        if(batch_splits_out != nullptr)
+        {
+            _memory_group.manage(&_batch_splits_out_f32);
+            _batch_splits_out_f32.allocator()->init(batch_splits_out->info()->clone()->set_data_type(DataType::F32));
+        }
+        if(keeps != nullptr)
+        {
+            _memory_group.manage(&_keeps_f32);
+            _keeps_f32.allocator()->init(keeps->info()->clone()->set_data_type(DataType::F32));
+        }
+
+        _box_with_nms_limit_kernel.configure(&_scores_in_f32, &_boxes_in_f32, (batch_splits_in != nullptr) ? &_batch_splits_in_f32 : nullptr,
+                                             &_scores_out_f32, &_boxes_out_f32, &_classes_f32,
+                                             (batch_splits_out != nullptr) ? &_batch_splits_out_f32 : nullptr, (keeps != nullptr) ? &_keeps_f32 : nullptr,
+                                             keeps_size, info);
+    }
+    else
+    {
+        _box_with_nms_limit_kernel.configure(scores_in, boxes_in, batch_splits_in, scores_out, boxes_out, classes, batch_splits_out, keeps, keeps_size, info);
+    }
+
+    if(_is_qasymm8)
+    {
+        _scores_in_f32.allocator()->allocate();
+        _boxes_in_f32.allocator()->allocate();
+        if(_batch_splits_in != nullptr)
+        {
+            _batch_splits_in_f32.allocator()->allocate();
+        }
+        _scores_out_f32.allocator()->allocate();
+        _boxes_out_f32.allocator()->allocate();
+        _classes_f32.allocator()->allocate();
+        if(batch_splits_out != nullptr)
+        {
+            _batch_splits_out_f32.allocator()->allocate();
+        }
+        if(keeps != nullptr)
+        {
+            _keeps_f32.allocator()->allocate();
+        }
+    }
+}
+
+Status CPPBoxWithNonMaximaSuppressionLimit::validate(const ITensorInfo *scores_in, const ITensorInfo *boxes_in, const ITensorInfo *batch_splits_in, const ITensorInfo *scores_out, const ITensorInfo *boxes_out, const ITensorInfo *classes,
+                const ITensorInfo *batch_splits_out, const ITensorInfo *keeps, const ITensorInfo *keeps_size, const BoxNMSLimitInfo info)
+{
+    ARM_COMPUTE_UNUSED(batch_splits_in, batch_splits_out, keeps, keeps_size, info);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(scores_in, boxes_in, scores_out, boxes_out, classes);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scores_in, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+
+    const bool is_qasymm8 = scores_in->data_type() == DataType::QASYMM8;
+    if(is_qasymm8)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(boxes_in, 1, DataType::QASYMM16);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(boxes_in, boxes_out);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(boxes_in, boxes_out);
+        const UniformQuantizationInfo boxes_qinfo = boxes_in->quantization_info().uniform();
+        ARM_COMPUTE_RETURN_ERROR_ON(boxes_qinfo.scale != 0.125f);
+        ARM_COMPUTE_RETURN_ERROR_ON(boxes_qinfo.offset != 0);
+    }
+
+    return Status{};
+}
+
+void CPPBoxWithNonMaximaSuppressionLimit::run()
+{
+    // Acquire all the temporaries
+    MemoryGroupResourceScope scope_mg(_memory_group);
+
+    if(_is_qasymm8)
+    {
+        dequantize_tensor(_scores_in, &_scores_in_f32);
+        dequantize_tensor(_boxes_in, &_boxes_in_f32);
+        if(_batch_splits_in != nullptr)
+        {
+            dequantize_tensor(_batch_splits_in, &_batch_splits_in_f32);
+        }
+    }
+
+    Scheduler::get().schedule(&_box_with_nms_limit_kernel, Window::DimY);
+
+    if(_is_qasymm8)
+    {
+        quantize_tensor(&_scores_out_f32, _scores_out);
+        quantize_tensor(&_boxes_out_f32, _boxes_out);
+        quantize_tensor(&_classes_f32, _classes);
+        if(_batch_splits_out != nullptr)
+        {
+            quantize_tensor(&_batch_splits_out_f32, _batch_splits_out);
+        }
+        if(_keeps != nullptr)
+        {
+            quantize_tensor(&_keeps_f32, _keeps);
+        }
+    }
+}
+} // namespace arm_compute
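
For readers new to the library's iteration helpers: the dequantize_tensor/quantize_tensor functions above rely on the Window/Iterator/execute_window_loop idiom to visit every element of a tensor. Below is a minimal standalone sketch of that idiom, reduced to a plain F32 copy so the traversal itself is easy to see; the function name is made up for illustration.

#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Window.h"

using namespace arm_compute;

// Hypothetical helper: copies an F32 tensor element by element using the same
// Window/Iterator pattern as dequantize_tensor()/quantize_tensor() above.
void copy_tensor_f32(const ITensor *input, ITensor *output)
{
    // Build a window that covers every element of the input shape (step 1 per dimension).
    Window window;
    window.use_tensor_dimensions(input->info()->tensor_shape());

    Iterator input_it(input, window);
    Iterator output_it(output, window);

    // The lambda runs once per coordinate; both iterators advance in lockstep.
    execute_window_loop(window, [&](const Coordinates &)
    {
        *reinterpret_cast<float *>(output_it.ptr()) = *reinterpret_cast<const float *>(input_it.ptr());
    },
    input_it, output_it);
}
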
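As for how the new quantized path is driven from the outside: scores arrive as QASYMM8 and boxes as QASYMM16 with scale 0.125 and zero offset (the combination validate() enforces), are dequantized into the managed F32 intermediates, pass through the unchanged F32 kernel, and are requantized on output. A hedged usage sketch follows; the shapes, quantization parameters and the driver function are illustrative assumptions, not taken from the library's tests.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

// Hypothetical driver: shapes and quantization parameters are assumptions for illustration.
void run_quantized_box_nms_limit(unsigned int num_rois, unsigned int num_classes)
{
    Tensor scores_in, boxes_in, scores_out, boxes_out, classes;
    scores_in.allocator()->init(TensorInfo(TensorShape(num_classes, num_rois), 1, DataType::QASYMM8, QuantizationInfo(1.f / 256, 0)));
    boxes_in.allocator()->init(TensorInfo(TensorShape(4 * num_classes, num_rois), 1, DataType::QASYMM16, QuantizationInfo(0.125f, 0)));
    scores_out.allocator()->init(TensorInfo(TensorShape(num_classes, num_rois), 1, DataType::QASYMM8, QuantizationInfo(1.f / 256, 0)));
    boxes_out.allocator()->init(TensorInfo(TensorShape(4 * num_classes, num_rois), 1, DataType::QASYMM16, QuantizationInfo(0.125f, 0)));
    classes.allocator()->init(TensorInfo(TensorShape(num_rois), 1, DataType::QASYMM8, QuantizationInfo(1.f, 0)));

    // batch_splits_in/out, keeps and keeps_size are optional and left unset here.
    CPPBoxWithNonMaximaSuppressionLimit nms;
    nms.configure(&scores_in, &boxes_in, nullptr, &scores_out, &boxes_out, &classes, nullptr, nullptr, nullptr, BoxNMSLimitInfo());

    scores_in.allocator()->allocate();
    boxes_in.allocator()->allocate();
    scores_out.allocator()->allocate();
    boxes_out.allocator()->allocate();
    classes.allocator()->allocate();

    // ... fill scores_in / boxes_in with quantized data, then:
    nms.run();
}
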
diff --git a/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp b/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp
index 13a34b4..e0acf06 100644
--- a/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp
+++ b/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp
@@ -464,7 +464,7 @@
                 // Ignore background class.
                 continue;
             }
-            ARM_COMPUTE_ERROR_ON_MSG(_all_location_predictions[i].find(label) == _all_location_predictions[i].end(), "Could not find location predictions for label %d.", label);
+            ARM_COMPUTE_ERROR_ON_MSG_VAR(_all_location_predictions[i].find(label) == _all_location_predictions[i].end(), "Could not find location predictions for label %d.", label);
 
             const std::vector<BBox> &label_loc_preds = _all_location_predictions[i].find(label)->second;
 
@@ -497,7 +497,7 @@
             const int label = _info.share_location() ? -1 : c;
             if(conf_scores.find(c) == conf_scores.end() || decode_bboxes.find(label) == decode_bboxes.end())
             {
-                ARM_COMPUTE_ERROR("Could not find predictions for label %d.", label);
+                ARM_COMPUTE_ERROR_VAR("Could not find predictions for label %d.", label);
             }
             const std::vector<float> &scores = conf_scores.find(c)->second;
             const std::vector<BBox> &bboxes = decode_bboxes.find(label)->second;
@@ -518,7 +518,7 @@
 
                 if(conf_scores.find(label) == conf_scores.end())
                 {
-                    ARM_COMPUTE_ERROR("Could not find predictions for label %d.", label);
+                    ARM_COMPUTE_ERROR_VAR("Could not find predictions for label %d.", label);
                 }
 
                 const std::vector<float> &scores = conf_scores.find(label)->second;
@@ -570,7 +570,7 @@
             {
                 // Either if there are no confidence predictions
                 // or there are no location predictions for current label.
-                ARM_COMPUTE_ERROR("Could not find predictions for the label %d.", label);
+                ARM_COMPUTE_ERROR_VAR("Could not find predictions for the label %d.", label);
             }
             const std::vector<BBox> &bboxes  = decode_bboxes.find(loc_label)->second;
             const std::vector<int> &indices = it.second;
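
Context for the macro renames above: the error macros were split into fixed-message and variadic variants, so ARM_COMPUTE_ERROR and ARM_COMPUTE_ERROR_ON_MSG now take only a literal message while printf-style arguments go to the new *_VAR forms. A small sketch of the intended usage; the function and the checked conditions are made up for illustration.

#include "arm_compute/core/Error.h"

// Hypothetical check illustrating the two macro families.
void check_label(int label, int num_classes)
{
    // Fixed-message variant: message only, no format arguments.
    ARM_COMPUTE_ERROR_ON_MSG(num_classes <= 0, "The number of classes must be positive.");
    // Variadic variant: printf-style format string plus arguments.
    ARM_COMPUTE_ERROR_ON_MSG_VAR(label >= num_classes, "Could not find predictions for label %d.", label);
}
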
diff --git a/src/runtime/CPP/functions/CPPDetectionPostProcessLayer.cpp b/src/runtime/CPP/functions/CPPDetectionPostProcessLayer.cpp
index 2997b59..bc88f71 100644
--- a/src/runtime/CPP/functions/CPPDetectionPostProcessLayer.cpp
+++ b/src/runtime/CPP/functions/CPPDetectionPostProcessLayer.cpp
@@ -42,20 +42,20 @@
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input_box_encoding, input_class_score, input_anchors);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_box_encoding, 1, DataType::F32, DataType::QASYMM8);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_box_encoding, input_class_score, input_anchors);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_box_encoding, input_anchors);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_box_encoding->num_dimensions() > 3, "The location input tensor shape should be [4, N, kBatchSize].");
     if(input_box_encoding->num_dimensions() > 2)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_box_encoding->dimension(2) != kBatchSize, "The third dimension of the input box_encoding tensor should be equal to %d.", kBatchSize);
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR(input_box_encoding->dimension(2) != kBatchSize, "The third dimension of the input box_encoding tensor should be equal to %d.", kBatchSize);
     }
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_box_encoding->dimension(0) != kNumCoordBox, "The first dimension of the input box_encoding tensor should be equal to %d.", kNumCoordBox);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR(input_box_encoding->dimension(0) != kNumCoordBox, "The first dimension of the input box_encoding tensor should be equal to %d.", kNumCoordBox);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_class_score->dimension(0) != (info.num_classes() + 1),
                                     "The first dimension of the input class_prediction should be equal to the number of classes plus one.");
 
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_anchors->num_dimensions() > 3, "The anchors input tensor shape should be [4, N, kBatchSize].");
     if(input_anchors->num_dimensions() > 2)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_anchors->dimension(0) != kNumCoordBox, "The first dimension of the input anchors tensor should be equal to %d.", kNumCoordBox);
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR(input_anchors->dimension(0) != kNumCoordBox, "The first dimension of the input anchors tensor should be equal to %d.", kNumCoordBox);
     }
     ARM_COMPUTE_RETURN_ERROR_ON_MSG((input_box_encoding->dimension(1) != input_class_score->dimension(1))
                                     || (input_box_encoding->dimension(1) != input_anchors->dimension(1)),
@@ -156,7 +156,7 @@
                  std::vector<unsigned int> &sorted_indices, const unsigned int num_output, const unsigned int max_detections, ITensor *output_boxes, ITensor *output_classes, ITensor *output_scores,
                  ITensor *num_detection)
 {
-    // ymin,xmin,ymax,xmax -> xmin,ymin,xmax,ymax
+    // xmin,ymin,xmax,ymax -> ymin,xmin,ymax,xmax
     unsigned int i = 0;
     for(; i < num_output; ++i)
     {
@@ -183,8 +183,8 @@
 
 CPPDetectionPostProcessLayer::CPPDetectionPostProcessLayer(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)), _nms(), _input_box_encoding(nullptr), _input_scores(nullptr), _input_anchors(nullptr), _output_boxes(nullptr), _output_classes(nullptr),
-      _output_scores(nullptr), _num_detection(nullptr), _info(), _num_boxes(), _num_classes_with_background(), _num_max_detected_boxes(), _decoded_boxes(), _decoded_scores(), _selected_indices(),
-      _class_scores(), _input_scores_to_use(nullptr), _result_idx_boxes_after_nms(), _result_classes_after_nms(), _result_scores_after_nms(), _sorted_indices(), _box_scores()
+      _output_scores(nullptr), _num_detection(nullptr), _info(), _num_boxes(), _num_classes_with_background(), _num_max_detected_boxes(), _dequantize_scores(false), _decoded_boxes(), _decoded_scores(),
+      _selected_indices(), _class_scores(), _input_scores_to_use(nullptr)
 {
 }
 
@@ -214,15 +214,15 @@
     _info                        = info;
     _num_boxes                   = input_box_encoding->info()->dimension(1);
     _num_classes_with_background = _input_scores->info()->dimension(0);
+    _dequantize_scores           = (info.dequantize_scores() && is_data_type_quantized(input_box_encoding->info()->data_type()));
 
     auto_init_if_empty(*_decoded_boxes.info(), TensorInfo(TensorShape(_kNumCoordBox, _input_box_encoding->info()->dimension(1), _kBatchSize), 1, DataType::F32));
     auto_init_if_empty(*_decoded_scores.info(), TensorInfo(TensorShape(_input_scores->info()->dimension(0), _input_scores->info()->dimension(1), _kBatchSize), 1, DataType::F32));
-    auto_init_if_empty(*_selected_indices.info(), TensorInfo(TensorShape(info.max_detections()), 1, DataType::S32));
-
+    auto_init_if_empty(*_selected_indices.info(), TensorInfo(TensorShape(info.use_regular_nms() ? info.detection_per_class() : info.max_detections()), 1, DataType::S32));
     const unsigned int num_classes_per_box = std::min(info.max_classes_per_detection(), info.num_classes());
     auto_init_if_empty(*_class_scores.info(), TensorInfo(info.use_regular_nms() ? TensorShape(_num_boxes) : TensorShape(_num_boxes * num_classes_per_box), 1, DataType::F32));
 
-    _input_scores_to_use = is_data_type_quantized(input_box_encoding->info()->data_type()) ? &_decoded_scores : _input_scores;
+    _input_scores_to_use = _dequantize_scores ? &_decoded_scores : _input_scores;
 
     // Manage intermediate buffers
     _memory_group.manage(&_decoded_boxes);
@@ -236,21 +236,6 @@
     _decoded_scores.allocator()->allocate();
     _selected_indices.allocator()->allocate();
     _class_scores.allocator()->allocate();
-
-    if(info.use_regular_nms())
-    {
-        _result_idx_boxes_after_nms.reserve(_info.detection_per_class() * _info.num_classes());
-        _result_classes_after_nms.reserve(_info.detection_per_class() * _info.num_classes());
-        _result_scores_after_nms.reserve(_info.detection_per_class() * _info.num_classes());
-    }
-    else
-    {
-        _result_scores_after_nms.reserve(num_classes_per_box * _num_boxes);
-        _result_classes_after_nms.reserve(num_classes_per_box * _num_boxes);
-        _result_scores_after_nms.reserve(num_classes_per_box * _num_boxes);
-        _box_scores.reserve(_num_boxes);
-    }
-    _sorted_indices.resize(info.use_regular_nms() ? info.max_detections() : info.num_classes());
 }
 
 Status CPPDetectionPostProcessLayer::validate(const ITensorInfo *input_box_encoding, const ITensorInfo *input_class_score, const ITensorInfo *input_anchors,
@@ -277,7 +262,7 @@
     DecodeCenterSizeBoxes(_input_box_encoding, _input_anchors, _info, &_decoded_boxes);
 
     // Decode scores if necessary
-    if(is_data_type_quantized(_input_box_encoding->info()->data_type()))
+    if(_dequantize_scores)
     {
         for(unsigned int idx_c = 0; idx_c < _num_classes_with_background; ++idx_c)
         {
@@ -288,9 +273,15 @@
             }
         }
     }
+
     // Regular NMS
     if(_info.use_regular_nms())
     {
+        std::vector<int>          result_idx_boxes_after_nms;
+        std::vector<int>          result_classes_after_nms;
+        std::vector<float>        result_scores_after_nms;
+        std::vector<unsigned int> sorted_indices;
+
         for(unsigned int c = 0; c < num_classes; ++c)
         {
             // For each boxes get scores of the boxes for the class c
@@ -299,6 +290,8 @@
                 *(reinterpret_cast<float *>(_class_scores.ptr_to_element(Coordinates(i)))) =
                     *(reinterpret_cast<float *>(_input_scores_to_use->ptr_to_element(Coordinates(c + 1, i)))); // i * _num_classes_with_background + c + 1
             }
+
+            // Run Non-maxima Suppression
             _nms.run();
 
             for(unsigned int i = 0; i < _info.detection_per_class(); ++i)
@@ -307,67 +300,73 @@
                 if(selected_index == -1)
                 {
                     // Nms will return -1 for all the last M-elements not valid
-                    continue;
+                    break;
                 }
-                _result_idx_boxes_after_nms.emplace_back(selected_index);
-                _result_scores_after_nms.emplace_back((reinterpret_cast<float *>(_class_scores.buffer()))[selected_index]);
-                _result_classes_after_nms.emplace_back(c);
+                result_idx_boxes_after_nms.emplace_back(selected_index);
+                result_scores_after_nms.emplace_back((reinterpret_cast<float *>(_class_scores.buffer()))[selected_index]);
+                result_classes_after_nms.emplace_back(c);
             }
         }
 
         // We select the max detection numbers of the highest score of all classes
-        const auto num_selected = _result_idx_boxes_after_nms.size();
+        const auto num_selected = result_scores_after_nms.size();
         const auto num_output   = std::min<unsigned int>(max_detections, num_selected);
 
         // Sort selected indices based on result scores
-        std::iota(_sorted_indices.begin(), _sorted_indices.end(), 0);
-        std::partial_sort(_sorted_indices.data(),
-                          _sorted_indices.data() + num_output,
-                          _sorted_indices.data() + num_selected,
+        sorted_indices.resize(num_selected);
+        std::iota(sorted_indices.begin(), sorted_indices.end(), 0);
+        std::partial_sort(sorted_indices.data(),
+                          sorted_indices.data() + num_output,
+                          sorted_indices.data() + num_selected,
                           [&](unsigned int first, unsigned int second)
         {
 
-            return _result_scores_after_nms[first] > _result_scores_after_nms[second];
+            return result_scores_after_nms[first] > result_scores_after_nms[second];
         });
 
-        SaveOutputs(&_decoded_boxes, _result_idx_boxes_after_nms, _result_scores_after_nms, _result_classes_after_nms,
-                    _sorted_indices, num_output, max_detections, _output_boxes, _output_classes, _output_scores, _num_detection);
+        SaveOutputs(&_decoded_boxes, result_idx_boxes_after_nms, result_scores_after_nms, result_classes_after_nms, sorted_indices,
+                    num_output, max_detections, _output_boxes, _output_classes, _output_scores, _num_detection);
     }
     // Fast NMS
     else
     {
         const unsigned int num_classes_per_box = std::min<unsigned int>(_info.max_classes_per_detection(), _info.num_classes());
-        for(unsigned int b = 0, index = 0; b < _num_boxes; ++b)
+        std::vector<float> max_scores;
+        std::vector<int>   box_indices;
+        std::vector<int>   max_score_classes;
+
+        for(unsigned int b = 0; b < _num_boxes; ++b)
         {
-            _box_scores.clear();
-            _sorted_indices.clear();
+            std::vector<float> box_scores;
             for(unsigned int c = 0; c < num_classes; ++c)
             {
-                _box_scores.emplace_back(*(reinterpret_cast<float *>(_input_scores_to_use->ptr_to_element(Coordinates(c + 1, b)))));
-                _sorted_indices.push_back(c);
+                box_scores.emplace_back(*(reinterpret_cast<float *>(_input_scores_to_use->ptr_to_element(Coordinates(c + 1, b)))));
             }
-            std::partial_sort(_sorted_indices.data(),
-                              _sorted_indices.data() + num_classes_per_box,
-                              _sorted_indices.data() + num_classes,
+
+            std::vector<unsigned int> max_score_indices;
+            max_score_indices.resize(_info.num_classes());
+            std::iota(max_score_indices.data(), max_score_indices.data() + _info.num_classes(), 0);
+            std::partial_sort(max_score_indices.data(),
+                              max_score_indices.data() + num_classes_per_box,
+                              max_score_indices.data() + num_classes,
                               [&](unsigned int first, unsigned int second)
             {
-                return _box_scores[first] > _box_scores[second];
+                return box_scores[first] > box_scores[second];
             });
 
-            for(unsigned int i = 0; i < num_classes_per_box; ++i, ++index)
+            for(unsigned int i = 0; i < num_classes_per_box; ++i)
             {
-                const float score_to_add                                                       = _box_scores[_sorted_indices[i]];
-                *(reinterpret_cast<float *>(_class_scores.ptr_to_element(Coordinates(index)))) = score_to_add;
-                _result_scores_after_nms.emplace_back(score_to_add);
-                _result_idx_boxes_after_nms.emplace_back(b);
-                _result_classes_after_nms.emplace_back(_sorted_indices[i]);
+                const float score_to_add                                                                             = box_scores[max_score_indices[i]];
+                *(reinterpret_cast<float *>(_class_scores.ptr_to_element(Coordinates(b * num_classes_per_box + i)))) = score_to_add;
+                max_scores.emplace_back(score_to_add);
+                box_indices.emplace_back(b);
+                max_score_classes.emplace_back(max_score_indices[i]);
             }
         }
 
-        // Run NMS
+        // Run Non-maxima Suppression
         _nms.run();
-
-        _sorted_indices.clear();
+        std::vector<unsigned int> selected_indices;
         for(unsigned int i = 0; i < max_detections; ++i)
         {
             // NMS returns M valid indices, the not valid tail is filled with -1
@@ -376,13 +375,13 @@
                 // Nms will return -1 for all the last M-elements not valid
                 break;
             }
-            _sorted_indices.emplace_back(*(reinterpret_cast<int *>(_selected_indices.ptr_to_element(Coordinates(i)))));
+            selected_indices.emplace_back(*(reinterpret_cast<int *>(_selected_indices.ptr_to_element(Coordinates(i)))));
         }
         // We select the max detection numbers of the highest score of all classes
-        const auto num_output = std::min<unsigned int>(_info.max_detections(), _sorted_indices.size());
+        const auto num_output = std::min<unsigned int>(_info.max_detections(), selected_indices.size());
 
-        SaveOutputs(&_decoded_boxes, _result_idx_boxes_after_nms, _result_scores_after_nms, _result_classes_after_nms,
-                    _sorted_indices, num_output, max_detections, _output_boxes, _output_classes, _output_scores, _num_detection);
+        SaveOutputs(&_decoded_boxes, box_indices, max_scores, max_score_classes, selected_indices,
+                    num_output, max_detections, _output_boxes, _output_classes, _output_scores, _num_detection);
     }
 }
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
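
Both NMS paths in the run() rewrite above pick the top-scoring entries by partially sorting an index vector against a score vector (per class for regular NMS, per box for fast NMS). A self-contained sketch of that selection step in plain C++, with made-up scores:

#include <algorithm>
#include <cstdio>
#include <numeric>
#include <vector>

int main()
{
    // One box with hypothetical scores for 4 classes (background excluded).
    const std::vector<float> box_scores          = { 0.1f, 0.7f, 0.05f, 0.6f };
    const unsigned int       num_classes_per_box = 2;

    // Sort class indices so the first num_classes_per_box entries point at the highest scores.
    std::vector<unsigned int> max_score_indices(box_scores.size());
    std::iota(max_score_indices.begin(), max_score_indices.end(), 0);
    std::partial_sort(max_score_indices.begin(),
                      max_score_indices.begin() + num_classes_per_box,
                      max_score_indices.end(),
                      [&](unsigned int first, unsigned int second)
    {
        return box_scores[first] > box_scores[second];
    });

    for(unsigned int i = 0; i < num_classes_per_box; ++i)
    {
        std::printf("class %u score %f\n", max_score_indices[i], box_scores[max_score_indices[i]]);
    }
    return 0;
}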