arm_compute v20.05
diff --git a/src/runtime/CPP/CPPScheduler.cpp b/src/runtime/CPP/CPPScheduler.cpp
index e684eee..0a03497 100644
--- a/src/runtime/CPP/CPPScheduler.cpp
+++ b/src/runtime/CPP/CPPScheduler.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2020 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -71,6 +71,61 @@
     const unsigned int _end;
 };
 
+/** Given two dimensions and a maximum number of threads to utilise, calculate the best
+ * combination of threads that fit in (when multiplied together) max_threads.
+ *
+ * This algorithm assumes that work in either dimension is equally difficult
+ * to compute
+ *
+ * @returns [m_nthreads, n_nthreads] A pair of the number of threads that should be used in each dimension
+ */
+std::pair<unsigned, unsigned> split_2d(unsigned max_threads, std::size_t m, std::size_t n)
+{
+    /*
+     * We want the ratio of threads in M and N to match the ratio of the problem sizes m and n.
+     *
+     * Therefore:    mt/nt == m/n    where mt*nt == max_threads
+     *
+     *             nt = max_threads/mt    &    mt / (max_threads/mt) = m/n
+     *             mt^2 = max_threads * (m/n)
+     *             mt = sqrt( max_threads * (m/n) )
+     */
+    // ratio of m to n in problem dimensions
+    double ratio = m / static_cast<double>(n);
+
+    // mt = sqrt(max_threads * (m / n))
+    const unsigned adjusted = std::round(std::sqrt(max_threads * ratio));
+
+    // find the nearest factor of max_threads
+    for(unsigned i = 0; i != adjusted; ++i)
+    {
+        // try down
+        const unsigned adj_down = adjusted - i;
+        if(max_threads % adj_down == 0)
+        {
+            return { adj_down, max_threads / adj_down };
+        }
+
+        // try up
+        const unsigned adj_up = adjusted + i;
+        if(max_threads % adj_up == 0)
+        {
+            return { adj_up, max_threads / adj_up };
+        }
+    }
+
+    // we didn't find a factor, so let's bail out with the thread counts biased to the largest dimension
+    if(m > n)
+    {
+        return { std::min<unsigned>(m, max_threads), 1 };
+    }
+    else
+    {
+        return { 1, std::min<unsigned>(n, max_threads) };
+    }
+}
+
 /** Execute workloads[info.thread_id] first, then call the feeder to get the index of the next workload to run.
  *
  * Will run workloads until the feeder reaches the end of its range.
@@ -314,50 +369,95 @@
     ARM_COMPUTE_ERROR_ON_MSG(!kernel, "The child class didn't set the kernel");
 
     const Window      &max_window     = kernel->window();
-    const unsigned int num_iterations = max_window.num_iterations(hints.split_dimension());
-    const unsigned int num_threads    = std::min(num_iterations, _impl->_num_threads);
 
-    if(num_iterations == 0)
+    if(hints.split_dimension() == IScheduler::split_dimensions_all)
     {
-        return;
-    }
+        /*
+         * If the split dimension is size_t max, this signals that we should
+         * parallelise over all dimensions
+         */
+        const std::size_t m = max_window.num_iterations(Window::DimX);
+        const std::size_t n = max_window.num_iterations(Window::DimY);
 
-    if(!kernel->is_parallelisable() || num_threads == 1)
-    {
-        ThreadInfo info;
-        info.cpu_info = &_cpu_info;
-        kernel->run(max_window, info);
+        // in C++17 this can be swapped for: auto [ m_threads, n_threads ] = split_2d(...
+        unsigned m_threads, n_threads;
+        std::tie(m_threads, n_threads) = split_2d(_impl->_num_threads, m, n);
+
+        std::vector<IScheduler::Workload> workloads;
+        for(unsigned int ni = 0; ni != n_threads; ++ni)
+        {
+            for(unsigned int mi = 0; mi != m_threads; ++mi)
+            {
+                workloads.push_back(
+                    [ ni, mi, m_threads, n_threads, &max_window, &kernel ]
+                    (const ThreadInfo & info)
+                    {
+                        // narrow the window to our mi-ni workload
+                        Window win = max_window.split_window(Window::DimX, mi, m_threads)
+                                               .split_window(Window::DimY, ni, n_threads);
+
+                        win.validate();
+
+                        Window thread_locator;
+                        thread_locator.set(Window::DimX, Window::Dimension(mi, m_threads));
+                        thread_locator.set(Window::DimY, Window::Dimension(ni, n_threads));
+
+                        thread_locator.validate();
+
+                        kernel->run_nd(win, info, thread_locator);
+                    }
+                );
+            }
+        }
+        run_workloads(workloads);
     }
     else
     {
-        unsigned int num_windows = 0;
-        switch(hints.strategy())
+        const unsigned int num_iterations = max_window.num_iterations(hints.split_dimension());
+        const unsigned int num_threads    = std::min(num_iterations, _impl->_num_threads);
+
+        if(num_iterations == 0)
         {
-            case StrategyHint::STATIC:
-                num_windows = num_threads;
-                break;
-            case StrategyHint::DYNAMIC:
+            return;
+        }
+
+        if(!kernel->is_parallelisable() || num_threads == 1)
+        {
+            ThreadInfo info;
+            info.cpu_info = &_cpu_info;
+            kernel->run(max_window, info);
+        }
+        else
+        {
+            unsigned int num_windows = 0;
+            switch(hints.strategy())
             {
-                const unsigned int granule_threshold = (hints.threshold() <= 0) ? num_threads : static_cast<unsigned int>(hints.threshold());
-                // Make sure we don't use some windows which are too small as this might create some contention on the ThreadFeeder
-                num_windows = num_iterations > granule_threshold ? granule_threshold : num_iterations;
-                break;
+                case StrategyHint::STATIC:
+                    num_windows = num_threads;
+                    break;
+                case StrategyHint::DYNAMIC:
+                {
+                    const unsigned int granule_threshold = (hints.threshold() <= 0) ? num_threads : static_cast<unsigned int>(hints.threshold());
+                    // Make sure we don't use some windows which are too small as this might create some contention on the ThreadFeeder
+                    num_windows = num_iterations > granule_threshold ? granule_threshold : num_iterations;
+                    break;
+                }
+                default:
+                    ARM_COMPUTE_ERROR("Unknown strategy");
             }
-            default:
-                ARM_COMPUTE_ERROR("Unknown strategy");
-        }
-        std::vector<IScheduler::Workload> workloads(num_windows);
-        for(unsigned int t = 0; t < num_windows; t++)
-        {
-            //Capture 't' by copy, all the other variables by reference:
-            workloads[t] = [t, &hints, &max_window, &num_windows, &kernel](const ThreadInfo & info)
+            std::vector<IScheduler::Workload> workloads(num_windows);
+            for(unsigned int t = 0; t < num_windows; t++)
             {
-                Window win = max_window.split_window(hints.split_dimension(), t, num_windows);
-                win.validate();
-                kernel->run(win, info);
-            };
+                //Capture 't' by copy, all the other variables by reference:
+                workloads[t] = [t, &hints, &max_window, &num_windows, &kernel](const ThreadInfo & info)
+                {
+                    Window win = max_window.split_window(hints.split_dimension(), t, num_windows);
+                    win.validate();
+                    kernel->run(win, info);
+                };
+            }
+            run_workloads(workloads);
         }
-        run_workloads(workloads);
     }
 }
 } // namespace arm_compute
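
For illustration, the thread split above can be restated as a standalone sketch; this mirrors the factor search in split_2d() but is not the library code itself. The worked example in main() shows 8 threads on a 1024x256 problem snapping to a 4x2 grid, after which the scheduler carves max_window into one sub-window per (mi, ni) pair via split_window(DimX, mi, m_threads).split_window(DimY, ni, n_threads).

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdio>
#include <utility>

std::pair<unsigned, unsigned> split_2d_sketch(unsigned max_threads, std::size_t m, std::size_t n)
{
    // mt = sqrt(max_threads * (m / n)), then snap to the nearest factor of max_threads
    const double   ratio    = m / static_cast<double>(n);
    const unsigned adjusted = static_cast<unsigned>(std::round(std::sqrt(max_threads * ratio)));

    for(unsigned i = 0; i != adjusted; ++i)
    {
        if(max_threads % (adjusted - i) == 0)
        {
            return { adjusted - i, max_threads / (adjusted - i) };
        }
        if(max_threads % (adjusted + i) == 0)
        {
            return { adjusted + i, max_threads / (adjusted + i) };
        }
    }
    // no factor found: bias everything to the larger dimension
    return (m > n) ? std::make_pair(std::min<unsigned>(m, max_threads), 1u)
                   : std::make_pair(1u, std::min<unsigned>(n, max_threads));
}

int main()
{
    // ratio = 1024/256 = 4, sqrt(8 * 4) ~ 5.66 -> rounds to 6, nearest factor of 8 is 4
    const auto split = split_2d_sketch(8, 1024, 256);
    std::printf("m_threads=%u n_threads=%u\n", split.first, split.second); // prints 4 and 2
}
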
diff --git a/src/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.cpp b/src/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.cpp
index d4be939..232f71d 100644
--- a/src/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.cpp
+++ b/src/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2020 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -49,6 +49,13 @@
             },
             input_it, output_it);
             break;
+        case DataType::QASYMM8_SIGNED:
+            execute_window_loop(window, [&](const Coordinates &)
+            {
+                *reinterpret_cast<float *>(output_it.ptr()) = dequantize_qasymm8_signed(*reinterpret_cast<const int8_t *>(input_it.ptr()), qinfo);
+            },
+            input_it, output_it);
+            break;
         case DataType::QASYMM16:
             execute_window_loop(window, [&](const Coordinates &)
             {
@@ -80,6 +87,13 @@
             },
             input_it, output_it);
             break;
+        case DataType::QASYMM8_SIGNED:
+            execute_window_loop(window, [&](const Coordinates &)
+            {
+                *reinterpret_cast<int8_t *>(output_it.ptr()) = quantize_qasymm8_signed(*reinterpret_cast<const float *>(input_it.ptr()), qinfo);
+            },
+            input_it, output_it);
+            break;
         case DataType::QASYMM16:
             execute_window_loop(window, [&](const Coordinates &)
             {
@@ -121,7 +135,7 @@
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(scores_in, boxes_in, scores_out, boxes_out, classes);
 
-    _is_qasymm8 = scores_in->info()->data_type() == DataType::QASYMM8;
+    _is_qasymm8 = scores_in->info()->data_type() == DataType::QASYMM8 || scores_in->info()->data_type() == DataType::QASYMM8_SIGNED;
 
     _scores_in        = scores_in;
     _boxes_in         = boxes_in;
@@ -198,9 +212,9 @@
 {
     ARM_COMPUTE_UNUSED(batch_splits_in, batch_splits_out, keeps, keeps_size, info);
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(scores_in, boxes_in, scores_out, boxes_out, classes);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scores_in, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scores_in, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
 
-    const bool is_qasymm8 = scores_in->data_type() == DataType::QASYMM8;
+    const bool is_qasymm8 = scores_in->data_type() == DataType::QASYMM8 || scores_in->data_type() == DataType::QASYMM8_SIGNED;
     if(is_qasymm8)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(boxes_in, 1, DataType::QASYMM16);
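
The new QASYMM8_SIGNED cases above use the standard asymmetric 8-bit affine mapping, real = scale * (quantized - offset), with values stored as int8_t rather than uint8_t. Below is a minimal sketch of that conversion pair, under the assumption that the library's dequantize_qasymm8_signed()/quantize_qasymm8_signed() helpers follow this formula (the real ones take the library's QuantizationInfo and apply its rounding policy).

#include <algorithm>
#include <cmath>
#include <cstdint>

struct UniformQuantInfo
{
    float   scale;
    int32_t offset;
};

// int8 -> float: real = scale * (q - offset)
inline float dequantize_s8(int8_t q, UniformQuantInfo qinfo)
{
    return qinfo.scale * static_cast<float>(static_cast<int32_t>(q) - qinfo.offset);
}

// float -> int8: q = round(real / scale) + offset, clamped to [-128, 127]
inline int8_t quantize_s8(float real, UniformQuantInfo qinfo)
{
    const int32_t q = static_cast<int32_t>(std::lround(real / qinfo.scale)) + qinfo.offset;
    return static_cast<int8_t>(std::min(127, std::max(-128, q)));
}
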
diff --git a/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp b/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp
index e0acf06..4ec0ab6 100644
--- a/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp
+++ b/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2020 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,7 +26,6 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Validate.h"
-#include "support/ToolchainSupport.h"
 
 #include <list>
 
diff --git a/src/runtime/CPP/functions/CPPDetectionPostProcessLayer.cpp b/src/runtime/CPP/functions/CPPDetectionPostProcessLayer.cpp
index bc88f71..b3fc9c7 100644
--- a/src/runtime/CPP/functions/CPPDetectionPostProcessLayer.cpp
+++ b/src/runtime/CPP/functions/CPPDetectionPostProcessLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,7 +26,6 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Validate.h"
-#include "support/ToolchainSupport.h"
 
 #include <cstddef>
 #include <ios>
@@ -41,7 +40,7 @@
                           DetectionPostProcessLayerInfo info, const unsigned int kBatchSize, const unsigned int kNumCoordBox)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input_box_encoding, input_class_score, input_anchors);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_box_encoding, 1, DataType::F32, DataType::QASYMM8);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_box_encoding, 1, DataType::F32, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_box_encoding, input_anchors);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_box_encoding->num_dimensions() > 3, "The location input tensor shape should be [4, N, kBatchSize].");
     if(input_box_encoding->num_dimensions() > 2)
@@ -91,6 +90,24 @@
     return Status{};
 }
 
+inline void DecodeBoxCorner(BBox &box_centersize, BBox &anchor, Iterator &decoded_it, DetectionPostProcessLayerInfo info)
+{
+    const float half_factor = 0.5f;
+
+    // BBox is equivalent to CenterSizeEncoding [y,x,h,w]

+    const float y_center = box_centersize[0] / info.scale_value_y() * anchor[2] + anchor[0];
+    const float x_center = box_centersize[1] / info.scale_value_x() * anchor[3] + anchor[1];
+    const float half_h   = half_factor * static_cast<float>(std::exp(box_centersize[2] / info.scale_value_h())) * anchor[2];
+    const float half_w   = half_factor * static_cast<float>(std::exp(box_centersize[3] / info.scale_value_w())) * anchor[3];
+
+    // Box Corner encoding boxes are saved as [xmin, ymin, xmax, ymax]
+    auto decoded_ptr   = reinterpret_cast<float *>(decoded_it.ptr());
+    *(decoded_ptr)     = x_center - half_w; // xmin
+    *(1 + decoded_ptr) = y_center - half_h; // ymin
+    *(2 + decoded_ptr) = x_center + half_w; // xmax
+    *(3 + decoded_ptr) = y_center + half_h; // ymax
+}
+
 /** Decode a bbox according to a anchors and scale info.
  *
  * @param[in]  input_box_encoding The input prior bounding boxes.
@@ -102,8 +119,8 @@
 {
     const QuantizationInfo &qi_box     = input_box_encoding->info()->quantization_info();
     const QuantizationInfo &qi_anchors = input_anchors->info()->quantization_info();
-    BBox                    box_centersize;
-    BBox                    anchor;
+    BBox                    box_centersize{ {} };
+    BBox                    anchor{ {} };
 
     Window win;
     win.use_tensor_dimensions(input_box_encoding->info()->tensor_shape());
@@ -113,11 +130,9 @@
     Iterator anchor_it(input_anchors, win);
     Iterator decoded_it(decoded_boxes, win);
 
-    const float half_factor = 0.5f;
-
-    execute_window_loop(win, [&](const Coordinates &)
+    if(input_box_encoding->info()->data_type() == DataType::QASYMM8)
     {
-        if(is_data_type_quantized(input_box_encoding->info()->data_type()))
+        execute_window_loop(win, [&](const Coordinates &)
         {
             const auto box_ptr    = reinterpret_cast<const qasymm8_t *>(box_it.ptr());
             const auto anchor_ptr = reinterpret_cast<const qasymm8_t *>(anchor_it.ptr());
@@ -127,29 +142,38 @@
             anchor = BBox({ dequantize_qasymm8(*anchor_ptr, qi_anchors), dequantize_qasymm8(*(anchor_ptr + 1), qi_anchors),
                             dequantize_qasymm8(*(2 + anchor_ptr), qi_anchors), dequantize_qasymm8(*(3 + anchor_ptr), qi_anchors)
                           });
-        }
-        else
+            DecodeBoxCorner(box_centersize, anchor, decoded_it, info);
+        },
+        box_it, anchor_it, decoded_it);
+    }
+    else if(input_box_encoding->info()->data_type() == DataType::QASYMM8_SIGNED)
+    {
+        execute_window_loop(win, [&](const Coordinates &)
+        {
+            const auto box_ptr    = reinterpret_cast<const qasymm8_signed_t *>(box_it.ptr());
+            const auto anchor_ptr = reinterpret_cast<const qasymm8_signed_t *>(anchor_it.ptr());
+            box_centersize        = BBox({ dequantize_qasymm8_signed(*box_ptr, qi_box), dequantize_qasymm8_signed(*(box_ptr + 1), qi_box),
+                                           dequantize_qasymm8_signed(*(2 + box_ptr), qi_box), dequantize_qasymm8_signed(*(3 + box_ptr), qi_box)
+                                         });
+            anchor = BBox({ dequantize_qasymm8_signed(*anchor_ptr, qi_anchors), dequantize_qasymm8_signed(*(anchor_ptr + 1), qi_anchors),
+                            dequantize_qasymm8_signed(*(2 + anchor_ptr), qi_anchors), dequantize_qasymm8_signed(*(3 + anchor_ptr), qi_anchors)
+                          });
+            DecodeBoxCorner(box_centersize, anchor, decoded_it, info);
+        },
+        box_it, anchor_it, decoded_it);
+    }
+    else
+    {
+        execute_window_loop(win, [&](const Coordinates &)
         {
             const auto box_ptr    = reinterpret_cast<const float *>(box_it.ptr());
             const auto anchor_ptr = reinterpret_cast<const float *>(anchor_it.ptr());
             box_centersize        = BBox({ *box_ptr, *(box_ptr + 1), *(2 + box_ptr), *(3 + box_ptr) });
             anchor                = BBox({ *anchor_ptr, *(anchor_ptr + 1), *(2 + anchor_ptr), *(3 + anchor_ptr) });
-        }
-
-        // BBox is equavalent to CenterSizeEncoding [y,x,h,w]
-        const float y_center = box_centersize[0] / info.scale_value_y() * anchor[2] + anchor[0];
-        const float x_center = box_centersize[1] / info.scale_value_x() * anchor[3] + anchor[1];
-        const float half_h   = half_factor * static_cast<float>(std::exp(box_centersize[2] / info.scale_value_h())) * anchor[2];
-        const float half_w   = half_factor * static_cast<float>(std::exp(box_centersize[3] / info.scale_value_w())) * anchor[3];
-
-        // Box Corner encoding boxes are saved as [xmin, ymin, xmax, ymax]
-        auto decoded_ptr   = reinterpret_cast<float *>(decoded_it.ptr());
-        *(decoded_ptr)     = x_center - half_w; // xmin
-        *(1 + decoded_ptr) = y_center - half_h; // ymin
-        *(2 + decoded_ptr) = x_center + half_w; // xmax
-        *(3 + decoded_ptr) = y_center + half_h; // ymax
-    },
-    box_it, anchor_it, decoded_it);
+            DecodeBoxCorner(box_centersize, anchor, decoded_it, info);
+        },
+        box_it, anchor_it, decoded_it);
+    }
 }
 
 void SaveOutputs(const Tensor *decoded_boxes, const std::vector<int> &result_idx_boxes_after_nms, const std::vector<float> &result_scores_after_nms, const std::vector<int> &result_classes_after_nms,
@@ -264,12 +288,26 @@
     // Decode scores if necessary
     if(_dequantize_scores)
     {
-        for(unsigned int idx_c = 0; idx_c < _num_classes_with_background; ++idx_c)
+        if(_input_box_encoding->info()->data_type() == DataType::QASYMM8)
         {
-            for(unsigned int idx_b = 0; idx_b < _num_boxes; ++idx_b)
+            for(unsigned int idx_c = 0; idx_c < _num_classes_with_background; ++idx_c)
             {
-                *(reinterpret_cast<float *>(_decoded_scores.ptr_to_element(Coordinates(idx_c, idx_b)))) =
-                    dequantize_qasymm8(*(reinterpret_cast<qasymm8_t *>(_input_scores->ptr_to_element(Coordinates(idx_c, idx_b)))), _input_scores->info()->quantization_info());
+                for(unsigned int idx_b = 0; idx_b < _num_boxes; ++idx_b)
+                {
+                    *(reinterpret_cast<float *>(_decoded_scores.ptr_to_element(Coordinates(idx_c, idx_b)))) =
+                        dequantize_qasymm8(*(reinterpret_cast<qasymm8_t *>(_input_scores->ptr_to_element(Coordinates(idx_c, idx_b)))), _input_scores->info()->quantization_info());
+                }
+            }
+        }
+        else if(_input_box_encoding->info()->data_type() == DataType::QASYMM8_SIGNED)
+        {
+            for(unsigned int idx_c = 0; idx_c < _num_classes_with_background; ++idx_c)
+            {
+                for(unsigned int idx_b = 0; idx_b < _num_boxes; ++idx_b)
+                {
+                    *(reinterpret_cast<float *>(_decoded_scores.ptr_to_element(Coordinates(idx_c, idx_b)))) =
+                        dequantize_qasymm8_signed(*(reinterpret_cast<qasymm8_signed_t *>(_input_scores->ptr_to_element(Coordinates(idx_c, idx_b)))), _input_scores->info()->quantization_info());
+                }
             }
         }
     }
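
The new DecodeBoxCorner() helper concentrates the center-size to corner-box arithmetic that was previously duplicated inside the window loop. Pulled out as a free-standing sketch (BBoxF and the scale parameters are stand-ins for the library's BBox and DetectionPostProcessLayerInfo accessors):

#include <array>
#include <cmath>

using BBoxF = std::array<float, 4>;

// box and anchor are center-size encoded as [y, x, h, w]; the scales are the
// info.scale_value_*() divisors. Returns corner form [xmin, ymin, xmax, ymax].
BBoxF decode_box_corner_sketch(const BBoxF &box, const BBoxF &anchor,
                               float scale_y, float scale_x, float scale_h, float scale_w)
{
    const float y_center = box[0] / scale_y * anchor[2] + anchor[0];
    const float x_center = box[1] / scale_x * anchor[3] + anchor[1];
    const float half_h   = 0.5f * std::exp(box[2] / scale_h) * anchor[2];
    const float half_w   = 0.5f * std::exp(box[3] / scale_w) * anchor[3];

    return { x_center - half_w, y_center - half_h,   // xmin, ymin
             x_center + half_w, y_center + half_h }; // xmax, ymax
}

Splitting the window loop by data type also moves the type dispatch out of the per-element lambda, so each element pays only for the dequantisation it actually needs.
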
diff --git a/src/runtime/CPP/functions/CPPNonMaximumSuppression.cpp b/src/runtime/CPP/functions/CPPNonMaximumSuppression.cpp
index f13674a..8856191 100644
--- a/src/runtime/CPP/functions/CPPNonMaximumSuppression.cpp
+++ b/src/runtime/CPP/functions/CPPNonMaximumSuppression.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,7 +24,7 @@
 #include "arm_compute/runtime/CPP/functions/CPPNonMaximumSuppression.h"
 
 #include "arm_compute/core/CPP/kernels/CPPNonMaximumSuppressionKernel.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
 
 namespace arm_compute
 {
diff --git a/src/runtime/CPP/functions/CPPPermute.cpp b/src/runtime/CPP/functions/CPPPermute.cpp
index bafcd2f..1cdfe92 100644
--- a/src/runtime/CPP/functions/CPPPermute.cpp
+++ b/src/runtime/CPP/functions/CPPPermute.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,7 +24,7 @@
 #include "arm_compute/runtime/CPP/functions/CPPPermute.h"
 
 #include "arm_compute/core/CPP/kernels/CPPPermuteKernel.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
 
 using namespace arm_compute;
 
diff --git a/src/runtime/CPP/functions/CPPTopKV.cpp b/src/runtime/CPP/functions/CPPTopKV.cpp
index c4e1eab..eb0d560 100644
--- a/src/runtime/CPP/functions/CPPTopKV.cpp
+++ b/src/runtime/CPP/functions/CPPTopKV.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,7 +24,7 @@
 #include "arm_compute/runtime/CPP/functions/CPPTopKV.h"
 
 #include "arm_compute/core/CPP/kernels/CPPTopKVKernel.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
 
 namespace arm_compute
 {
diff --git a/src/runtime/CPP/functions/CPPUpsample.cpp b/src/runtime/CPP/functions/CPPUpsample.cpp
index 0fd7d93..a154b5e 100644
--- a/src/runtime/CPP/functions/CPPUpsample.cpp
+++ b/src/runtime/CPP/functions/CPPUpsample.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,7 +24,7 @@
 #include "arm_compute/runtime/CPP/functions/CPPUpsample.h"
 
 #include "arm_compute/core/CPP/kernels/CPPUpsampleKernel.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
 
 using namespace arm_compute;
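
The remaining files in this section swap support/ToolchainSupport.h for support/MemorySupport.h: these thin function wrappers only need the pre-C++14 make_unique backport, not the whole toolchain shim. Their configure() methods follow roughly the pattern sketched below (illustrative; each kernel's actual configure() parameter list differs per function).

#include "arm_compute/runtime/CPP/functions/CPPUpsample.h"

#include "arm_compute/core/CPP/kernels/CPPUpsampleKernel.h"
#include "support/MemorySupport.h" // provides arm_compute::support::cpp14::make_unique

#include <utility>

using namespace arm_compute;

void CPPUpsample::configure(const ITensor *input, ITensor *output, const PadStrideInfo &info)
{
    auto k = support::cpp14::make_unique<CPPUpsampleKernel>();
    k->configure(input, output, info);
    _kernel = std::move(k); // hand ownership to the ICPPSimpleFunction base
}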