arm_compute v18.03

Change-Id: I8f9a2a9d32a6cab019b8504d313216f28671f9f5
diff --git a/src/core/AccessWindowStatic.cpp b/src/core/AccessWindowStatic.cpp
index 2ddd59e..a3a0f28 100644
--- a/src/core/AccessWindowStatic.cpp
+++ b/src/core/AccessWindowStatic.cpp
@@ -49,6 +49,8 @@
         return input_valid_region;
     }
 
+    ARM_COMPUTE_UNUSED(window);
+
     Coordinates &anchor = input_valid_region.anchor;
     TensorShape &shape  = input_valid_region.shape;
 
@@ -68,14 +70,6 @@
         shape.set(1, std::min<int>(_end_y, _info->tensor_shape()[1]));
     }
 
-    // For higher dimension use the intersection of the window size and the
-    // valid region of the input
-    for(size_t d = 2; d < _info->num_dimensions(); ++d)
-    {
-        anchor.set(d, std::max(window[d].start(), input_valid_region.anchor[d]));
-        shape.set(d, std::min<int>(window[d].end(), input_valid_region.shape[d]) - anchor[d]);
-    }
-
     return input_valid_region;
 }
 
diff --git a/src/core/CL/OpenCL.cpp b/src/core/CL/OpenCL.cpp
index 06d10a4..f75a90a 100644
--- a/src/core/CL/OpenCL.cpp
+++ b/src/core/CL/OpenCL.cpp
@@ -788,4 +788,4 @@
     {
         return CL_OUT_OF_RESOURCES;
     }
-}
\ No newline at end of file
+}
diff --git a/src/core/CL/kernels/CLCol2ImKernel.cpp b/src/core/CL/kernels/CLCol2ImKernel.cpp
index eacfa4c..8ccec06 100644
--- a/src/core/CL/kernels/CLCol2ImKernel.cpp
+++ b/src/core/CL/kernels/CLCol2ImKernel.cpp
@@ -144,7 +144,9 @@
 
 Status CLCol2ImKernel::validate(const ITensorInfo *input, const ITensorInfo *output, std::pair<unsigned int, unsigned int> convolved_dims)
 {
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, convolved_dims));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), convolved_dims).first);
     return Status{};
 }
 
diff --git a/src/core/CL/kernels/CLGEMMMatrixAdditionKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixAdditionKernel.cpp
index 3fe956d..4b4814f 100644
--- a/src/core/CL/kernels/CLGEMMMatrixAdditionKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixAdditionKernel.cpp
@@ -116,7 +116,9 @@
 
 Status CLGEMMMatrixAdditionKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const float beta)
 {
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, beta));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
     return Status{};
 }
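
Both validate() entry points above follow the same static-validation pattern: null-check the tensor infos, validate the argument constraints, then dry-run the window configuration on cloned infos so the caller's ITensorInfo objects are never modified. A minimal sketch of the pattern for a hypothetical kernel (validate_arguments and validate_and_configure_window stand in for the kernel-specific helpers; the latter is assumed to return a std::pair<Status, Window>, as in the kernels above):

    Status SomeKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
    {
        ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
        // Static checks on data types, shapes, quantization info, etc.
        ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
        // Configure the window on clones so the originals stay untouched
        ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
        return Status{};
    }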
 
diff --git a/src/core/CL/kernels/CLPermuteKernel.cpp b/src/core/CL/kernels/CLPermuteKernel.cpp
index 12c2d58..da34448 100644
--- a/src/core/CL/kernels/CLPermuteKernel.cpp
+++ b/src/core/CL/kernels/CLPermuteKernel.cpp
@@ -54,10 +54,10 @@
                                                          DataType::U16, DataType::S16, DataType::QS16,
                                                          DataType::U32, DataType::S32,
                                                          DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->num_dimensions() < 3), "Invalid input size!");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((perm.num_dimensions() == 3 && !(perm[0] == 2 && perm[1] == 0 && perm[2] == 1) && !(perm[0] == 1 && perm[1] == 2 && perm[2] == 0)) || (perm.num_dimensions() == 4
-                                     && !(perm[0] == 3 && perm[1] == 2 && perm[2] == 0 && perm[3] == 1))),
-                                    "Only [2, 0, 1],[1, 2, 0] and [3, 2, 0, 1] permutation is supported");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((perm != PermutationVector{ 2U, 0U, 1U })
+                                    && (perm != PermutationVector{ 1U, 2U, 0U })
+                                    && (perm != PermutationVector{ 3U, 2U, 0U, 1U }),
+                                    "Only [2, 0, 1], [1, 2, 0] and [3, 2, 0, 1] permutation is supported");
 
     const TensorShape output_shape = misc::shape_calculator::compute_permutation_output_shape(*input, perm);
 
diff --git a/src/core/NEON/kernels/NEActivationLayerKernel.cpp b/src/core/NEON/kernels/NEActivationLayerKernel.cpp
index a6dbfe6..ec12515 100644
--- a/src/core/NEON/kernels/NEActivationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEActivationLayerKernel.cpp
@@ -574,13 +574,13 @@
 template <ActivationLayerInfo::ActivationFunction F, typename T>
 typename std::enable_if<std::is_same<T, qasymm8_t>::value, void>::type NEActivationLayerKernel::activation(const Window &window)
 {
-    Iterator                  input(_input, window);
-    Iterator                  output(_output, window);
-    const QuantizationInfo    qi_in   = _input->info()->quantization_info();
-    const QuantizationInfo    qi_out  = _output->info()->quantization_info();
-    const qasymm8x16_t        a       = vdupq_n_u8(sqcvt_qasymm8_f32(_act_info.a(), qi_in.scale, qi_in.offset));
-    const qasymm8x16_t        b       = vdupq_n_u8(sqcvt_qasymm8_f32(_act_info.b(), qi_in.scale, qi_in.offset));
-    static const qasymm8x16_t CONST_0 = vdupq_n_u8(sqcvt_qasymm8_f32(0.f, qi_in.scale, qi_in.offset));
+    Iterator               input(_input, window);
+    Iterator               output(_output, window);
+    const QuantizationInfo qi_in   = _input->info()->quantization_info();
+    const QuantizationInfo qi_out  = _output->info()->quantization_info();
+    const qasymm8x16_t     a       = vdupq_n_u8(sqcvt_qasymm8_f32(_act_info.a(), qi_in.scale, qi_in.offset));
+    const qasymm8x16_t     b       = vdupq_n_u8(sqcvt_qasymm8_f32(_act_info.b(), qi_in.scale, qi_in.offset));
+    const qasymm8x16_t     CONST_0 = vdupq_n_u8(sqcvt_qasymm8_f32(0.f, qi_in.scale, qi_in.offset));
 
     // Initialise scale/offset for re-quantization
     float       s  = qi_in.scale / qi_out.scale;
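
The scale initialised here is part of the usual affine re-quantization from the input to the output quantization space. Assuming the standard qasymm8 convention real = scale * (q - offset), the mapping folds into a single multiply-add, sketched below in scalar form (the kernel applies the vectorised equivalent):

    // Real value represented by an input sample:  real  = qi_in.scale * (q_in - qi_in.offset)
    // Re-quantize with the output parameters:     q_out = real / qi_out.scale + qi_out.offset
    // Folding the two gives q_out = s * q_in + o, with:
    const float s = qi_in.scale / qi_out.scale;
    const float o = qi_out.offset - s * qi_in.offset;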
diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/output_4x4_3x3_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/output_4x4_3x3_fp32.cpp
index 8f47736..609823b 100644
--- a/src/core/NEON/kernels/convolution/winograd/transforms/output_4x4_3x3_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/transforms/output_4x4_3x3_fp32.cpp
@@ -41,9 +41,6 @@
   return 170 * tile_M * tile_N * shape.n_channels;
 }
 
-// Instantiate cost methods
-template int Transform::ops_performed(const Tensor4DShape&);
-
 /* F(4x4, 3x3) constructs 4x4 output tiles from a 3x3 convolution. Since we use
  * enough tiles to cover the output space each output tile may contain up to 3
  * padded values to the right and bottom columns or rows of the tile, e.g.:
diff --git a/src/graph/Graph.cpp b/src/graph/Graph.cpp
index b6c6822..2fe3a90 100644
--- a/src/graph/Graph.cpp
+++ b/src/graph/Graph.cpp
@@ -33,8 +33,19 @@
 #include "arm_compute/runtime/Tensor.h"
 #include "support/ToolchainSupport.h"
 
+#include <sys/stat.h>
+
 using namespace arm_compute::graph;
 
+namespace
+{
+bool file_exists(const std::string &filename)
+{
+    std::ifstream file(filename);
+    return file.good();
+}
+
+} // namespace
 struct Stage
 {
     ITensorObject                          *_input;
@@ -69,9 +80,13 @@
     GraphHints     _previous_hints{};
 };
 
+static const std::string tuner_data_filename = "acl_tuner.csv";
 Graph::~Graph() //NOLINT
 {
-    //Can't use =default because the destructor must be defined after Graph::Private's definition
+    if(_pimpl->_tuner.tune_new_kernels() && !_pimpl->_tuner.lws_table().empty())
+    {
+        _pimpl->_tuner.save_to_file(tuner_data_filename);
+    }
 }
 
 Graph::Graph()
@@ -85,17 +100,14 @@
     // Check if OpenCL is available and initialize the scheduler
     if(opencl_is_available())
     {
-        if(use_cl_tuner)
+        if(_pimpl->_tuner.lws_table().empty() && file_exists(tuner_data_filename))
         {
-            arm_compute::CLScheduler::get().default_init(&_pimpl->_tuner);
+            _pimpl->_tuner.load_from_file(tuner_data_filename);
         }
-        else
-        {
-            arm_compute::CLScheduler::get().default_init();
-        }
+        _pimpl->_tuner.set_tune_new_kernels(use_cl_tuner);
+        arm_compute::CLScheduler::get().default_init(&_pimpl->_tuner);
     }
 }
-
 void Graph::run()
 {
     while(true)
diff --git a/src/runtime/CL/CLTuner.cpp b/src/runtime/CL/CLTuner.cpp
index cf5b5bc..df8e255 100644
--- a/src/runtime/CL/CLTuner.cpp
+++ b/src/runtime/CL/CLTuner.cpp
@@ -27,26 +27,134 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 
+#include <cerrno>
+#include <fstream>
+#include <iostream>
 #include <limits>
 #include <string>
 
 using namespace arm_compute;
 
-CLTuner::CLTuner()
-    : real_function(nullptr), _lws_table(), _queue(), _queue_profiler(), _kernel_event()
+namespace
+{
+/* Function object used to intercept kernel enqueues and store their OpenCL events */
+class Interceptor
+{
+public:
+    explicit Interceptor(CLTuner &tuner);
+
+    /** clEnqueueNDRangeKernel interface
+     *
+     * @param[in] command_queue           A valid command-queue. The kernel will be queued for execution on the device associated with command_queue.
+     * @param[in] kernel                  A valid kernel object. The OpenCL context associated with kernel and command_queue must be the same.
+     * @param[in] work_dim                The number of dimensions used to specify the global work-items and work-items in the work-group. work_dim must be greater than zero and less than or equal to CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS.
+     * @param[in] gwo                     Global-Workgroup-Offset. It can be used to specify an array of work_dim unsigned values that describe the offset used to calculate the global ID of a work-item. If global_work_offset is NULL, the global IDs start at offset (0, 0, ... 0).
+     * @param[in] gws                     Global-Workgroup-Size. Points to an array of work_dim unsigned values that describe the number of global work-items in work_dim dimensions that will execute the kernel function.
+     * @param[in] lws                     Local-Workgroup-Size. Points to an array of work_dim unsigned values that describe the number of work-items that make up a work-group
+     * @param[in] num_events_in_wait_list Number of events in the waiting list
+     * @param[in] event_wait_list         Event waiting list
+     * @param[in] event                   OpenCL kernel event
+     *
+     * @return the OpenCL status
+     */
+    cl_int operator()(cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, const size_t *gwo, const size_t *gws, const size_t *lws, cl_uint num_events_in_wait_list,
+                      const cl_event *event_wait_list, cl_event *event);
+
+private:
+    CLTuner &_tuner;
+};
+
+Interceptor::Interceptor(CLTuner &tuner)
+    : _tuner(tuner)
 {
 }
 
+cl_int Interceptor::operator()(cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, const size_t *gwo, const size_t *gws, const size_t *lws, cl_uint num_events_in_wait_list,
+                               const cl_event *event_wait_list, cl_event *event)
+{
+    ARM_COMPUTE_ERROR_ON_MSG(event != nullptr, "Not supported");
+    ARM_COMPUTE_UNUSED(event);
+    if(_tuner.kernel_event_is_set())
+    {
+        // If the event is already set it means the kernel enqueue is sliced: given that we only time the first slice we can save time by skipping the other enqueues.
+        return CL_SUCCESS;
+    }
+    cl_event tmp;
+    cl_int   retval = _tuner.real_clEnqueueNDRangeKernel(command_queue, kernel, work_dim, gwo, gws, lws, num_events_in_wait_list, event_wait_list, &tmp);
+
+    // Set OpenCL event
+    _tuner.set_cl_kernel_event(tmp);
+
+    return retval;
+}
+
+} // namespace
+
+CLTuner::CLTuner(bool tune_new_kernels)
+    : real_clEnqueueNDRangeKernel(nullptr), _lws_table(), _queue(), _queue_profiler(), _kernel_event(), _tune_new_kernels(tune_new_kernels)
+{
+}
+
+bool CLTuner::kernel_event_is_set() const
+{
+    return _kernel_event() != nullptr;
+}
 void CLTuner::set_cl_kernel_event(cl_event kernel_event)
 {
     _kernel_event = kernel_event;
 }
 
+void CLTuner::set_tune_new_kernels(bool tune_new_kernels)
+{
+    _tune_new_kernels = tune_new_kernels;
+}
+bool CLTuner::tune_new_kernels() const
+{
+    return _tune_new_kernels;
+}
+
 void CLTuner::tune_kernel(ICLKernel &kernel)
 {
-    if(real_function == nullptr)
+    // Get the configuration ID from the kernel
+    const std::string &config_id = kernel.config_id();
+
+    // Check if we need to find the optimal LWS. If config_id is equal to default_config_id, the kernel does not need to be tuned
+    if(config_id != arm_compute::default_config_id)
     {
-        real_function = CLSymbols::get().clEnqueueNDRangeKernel_ptr;
+        auto p = _lws_table.find(config_id);
+
+        if(p == _lws_table.end())
+        {
+            if(_tune_new_kernels)
+            {
+                // Find the optimal LWS for the kernel
+                cl::NDRange opt_lws = find_optimal_lws(kernel);
+
+                // Insert the optimal LWS in the table
+                add_lws_to_table(config_id, opt_lws);
+
+                // Set Local-Workgroup-Size
+                kernel.set_lws_hint(opt_lws);
+            }
+        }
+        else
+        {
+            // Set Local-Workgroup-Size
+            kernel.set_lws_hint(p->second);
+        }
+    }
+}
+
+void CLTuner::add_lws_to_table(const std::string &kernel_id, cl::NDRange optimal_lws)
+{
+    _lws_table.emplace(kernel_id, optimal_lws);
+}
+
+cl::NDRange CLTuner::find_optimal_lws(ICLKernel &kernel)
+{
+    if(real_clEnqueueNDRangeKernel == nullptr)
+    {
+        real_clEnqueueNDRangeKernel = CLSymbols::get().clEnqueueNDRangeKernel_ptr;
 
         // Get the default queue
         _queue = CLScheduler::get().queue();
@@ -64,42 +172,6 @@
             _queue_profiler = _queue;
         }
     }
-
-    // Get the configuration ID from the kernel
-    const std::string &config_id = kernel.config_id();
-
-    // Check if we need to find the Optimal LWS. If config_id is equal to default_config_id, the kernel does not require to be tuned
-    if(config_id != arm_compute::default_config_id)
-    {
-        auto p = _lws_table.find(config_id);
-
-        if(p == _lws_table.end())
-        {
-            // Set profiler queue
-            CLScheduler::get().set_queue(_queue_profiler);
-
-            // Find the optimal LWS for the kernel
-            cl::NDRange opt_lws = find_optimal_lws(kernel);
-
-            // Insert the optimal LWS in the table
-            _lws_table.emplace(config_id, opt_lws);
-
-            // Set Local-Workgroup-Size
-            kernel.set_lws_hint(opt_lws);
-
-            // Restore queue
-            CLScheduler::get().set_queue(_queue);
-        }
-        else
-        {
-            // Set Local-Workgroup-Size
-            kernel.set_lws_hint(p->second);
-        }
-    }
-}
-
-cl::NDRange CLTuner::find_optimal_lws(ICLKernel &kernel)
-{
     // Start intercepting enqueues:
     CLSymbols::get().clEnqueueNDRangeKernel_ptr = Interceptor(*this);
 
@@ -123,11 +195,12 @@
         // Run the kernel
         kernel.run(kernel.window(), _queue_profiler);
 
-        CLScheduler::get().sync();
+        _queue_profiler.finish();
 
         const cl_ulong start = _kernel_event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
         const cl_ulong end   = _kernel_event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
         const cl_ulong diff  = end - start;
+        _kernel_event        = nullptr;
 
         min_exec_time = diff;
     }
@@ -153,11 +226,12 @@
                 // Run the kernel
                 kernel.run(kernel.window(), _queue_profiler);
 
-                CLScheduler::get().sync();
+                _queue_profiler.finish();
 
                 const cl_ulong start = _kernel_event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
                 const cl_ulong end   = _kernel_event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
                 const cl_ulong diff  = end - start;
+                _kernel_event        = nullptr;
 
                 // Check the execution time
                 if(diff < min_exec_time)
@@ -170,7 +244,7 @@
     }
 
     // Restore real function
-    CLSymbols::get().clEnqueueNDRangeKernel_ptr = real_function;
+    CLSymbols::get().clEnqueueNDRangeKernel_ptr = real_clEnqueueNDRangeKernel;
 
     return opt_lws;
 }
@@ -181,27 +255,58 @@
     _lws_table = lws_table;
 }
 
-const std::unordered_map<std::string, cl::NDRange> &CLTuner::export_lws_table()
+const std::unordered_map<std::string, cl::NDRange> &CLTuner::lws_table() const
 {
     return _lws_table;
 }
 
-Interceptor::Interceptor(CLTuner &tuner)
-    : _tuner(tuner)
+void CLTuner::load_from_file(const std::string &filename)
 {
+    std::ifstream fs;
+    fs.exceptions(std::ifstream::badbit);
+    fs.open(filename, std::ios::in);
+    if(!fs.is_open())
+    {
+        ARM_COMPUTE_ERROR("Failed to open '%s' (%s [%d])", filename.c_str(), strerror(errno), errno);
+    }
+    std::string line;
+    while(!std::getline(fs, line).fail())
+    {
+        std::istringstream ss(line);
+        std::string        token;
+        if(std::getline(ss, token, ';').fail())
+        {
+            ARM_COMPUTE_ERROR("Malformed row '%s' in %s (Should be of the form 'kernel_id;lws[0];lws[1];lws[2]')", ss.str().c_str(), filename.c_str());
+        }
+        std::string kernel_id = token;
+        cl::NDRange lws(1, 1, 1);
+        for(int i = 0; i < 3; i++)
+        {
+            if(std::getline(ss, token, ';').fail())
+            {
+                ARM_COMPUTE_ERROR("Malformed row '%s' in %s (Should be of the form 'kernel_id;lws[0];lws[1];lws[2]')", ss.str().c_str(), filename.c_str());
+            }
+            lws.get()[i] = support::cpp11::stoi(token);
+        }
+
+        // If all dimensions are 0: reset to NullRange (i.e. nullptr)
+        if(lws[0] == 0 && lws[1] == 0 && lws[2] == 0)
+        {
+            lws = cl::NullRange;
+        }
+        add_lws_to_table(kernel_id, lws);
+    }
+    fs.close();
 }
 
-cl_int Interceptor::operator()(cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, const size_t *gwo, const size_t *gws, const size_t *lws, cl_uint num_events_in_wait_list,
-                               const cl_event *event_wait_list, cl_event *event)
+void CLTuner::save_to_file(const std::string &filename) const
 {
-    ARM_COMPUTE_ERROR_ON_MSG(event != nullptr, "Not supported");
-    ARM_COMPUTE_UNUSED(event);
-
-    cl_event tmp;
-    cl_int   retval = _tuner.real_function(command_queue, kernel, work_dim, gwo, gws, lws, num_events_in_wait_list, event_wait_list, &tmp);
-
-    // Set OpenCL event
-    _tuner.set_cl_kernel_event(tmp);
-
-    return retval;
+    std::ofstream fs;
+    fs.exceptions(std::ifstream::failbit | std::ifstream::badbit);
+    fs.open(filename, std::ios::out);
+    for(auto kernel_data : _lws_table)
+    {
+        fs << kernel_data.first << ";" << kernel_data.second[0] << ";" << kernel_data.second[1] << ";" << kernel_data.second[2] << std::endl;
+    }
+    fs.close();
 }
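
The tuner state written by save_to_file() is a plain semicolon-separated table, one kernel per row, of the form kernel_id;lws[0];lws[1];lws[2]; an all-zero row is mapped back to cl::NullRange on load. A hedged usage sketch of the new API, assuming the acl_tuner.csv file name used by the graph code above (the sample kernel IDs are made up for illustration):

    // Sample acl_tuner.csv rows:
    //   some_kernel_config_id;4;2;1
    //   another_kernel_config_id;0;0;0
    CLTuner tuner(true); // also tune kernels that are not yet in the table
    // load_from_file() aborts if the file cannot be opened, so guard it
    // with an existence check as Graph.cpp does
    tuner.load_from_file("acl_tuner.csv");
    arm_compute::CLScheduler::get().default_init(&tuner);
    // ... run the CL workload ...
    tuner.save_to_file("acl_tuner.csv");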
diff --git a/src/runtime/CL/functions/CLDepthConcatenateLayer.cpp b/src/runtime/CL/functions/CLDepthConcatenateLayer.cpp
index 05b5d54..26d46a4 100644
--- a/src/runtime/CL/functions/CLDepthConcatenateLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthConcatenateLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -64,6 +64,9 @@
 
         depth_offset += inputs_vector.at(i)->info()->dimension(2);
     }
+
+    // Set valid region from shape
+    output->info()->set_valid_region(ValidRegion(Coordinates(), output_shape));
 }
 
 void CLDepthConcatenateLayer::run()
diff --git a/src/runtime/CPP/CPPScheduler.cpp b/src/runtime/CPP/CPPScheduler.cpp
index 4e4dd87..168ed6e 100644
--- a/src/runtime/CPP/CPPScheduler.cpp
+++ b/src/runtime/CPP/CPPScheduler.cpp
@@ -163,7 +163,7 @@
 
 void CPPScheduler::set_num_threads(unsigned int num_threads)
 {
-    _num_threads = num_threads == 0 ? std::thread::hardware_concurrency() : num_threads;
+    _num_threads = num_threads == 0 ? num_threads_hint() : num_threads;
     _threads.resize(_num_threads - 1);
 }
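
Passing 0 to set_num_threads() now defers to num_threads_hint() instead of std::thread::hardware_concurrency(), letting the library pick a sensible worker count for the device rather than the raw hardware thread count. A minimal usage sketch (the exact value returned by the hint is library-defined):

    // Let the scheduler choose the hinted thread count for this device
    CPPScheduler::get().set_num_threads(0);
    // Or force an explicit count
    CPPScheduler::get().set_num_threads(4);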
 
diff --git a/src/runtime/NEON/functions/NEDepthConcatenateLayer.cpp b/src/runtime/NEON/functions/NEDepthConcatenateLayer.cpp
index 437c941..930f8d5 100644
--- a/src/runtime/NEON/functions/NEDepthConcatenateLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthConcatenateLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -62,6 +62,9 @@
 
         depth_offset += inputs_vector.at(i)->info()->dimension(2);
     }
+
+    // Set valid region from shape
+    output->info()->set_valid_region(ValidRegion(Coordinates(), output_shape));
 }
 
 void NEDepthConcatenateLayer::run()