arm_compute v18.11
diff --git a/docs/00_introduction.dox b/docs/00_introduction.dox
index e3bbea2..8982e77 100644
--- a/docs/00_introduction.dox
+++ b/docs/00_introduction.dox
@@ -1,3 +1,26 @@
+///
+/// Copyright (c) 2017-2018 ARM Limited.
+///
+/// SPDX-License-Identifier: MIT
+///
+/// Permission is hereby granted, free of charge, to any person obtaining a copy
+/// of this software and associated documentation files (the "Software"), to
+/// deal in the Software without restriction, including without limitation the
+/// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+/// sell copies of the Software, and to permit persons to whom the Software is
+/// furnished to do so, subject to the following conditions:
+///
+/// The above copyright notice and this permission notice shall be included in all
+/// copies or substantial portions of the Software.
+///
+/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+/// SOFTWARE.
+///
namespace arm_compute
{
/** @mainpage Introduction
@@ -26,7 +49,7 @@
For each release we provide some pre-built binaries of the library [here](https://github.com/ARM-software/ComputeLibrary/releases)
These binaries have been built using the following toolchains:
- - Linux armv7a: gcc-linaro-arm-linux-gnueabihf-4.9-2014.07_linux
+ - Linux armv7a: gcc-linaro-4.9-2016.02-x86_64_arm-linux-gnueabihf
- Linux arm64-v8a: gcc-linaro-4.9-2016.02-x86_64_aarch64-linux-gnu
- Android armv7a: clang++ / libc++ NDK r17b
 - Android arm64-v8a: clang++ / libc++ NDK r17b
@@ -215,6 +238,69 @@
@subsection S2_2_changelog Changelog
+v18.11 Public major release
+ - Various bug fixes.
+ - Various optimisations.
+ - New Neon kernels / functions:
+ - @ref NEChannelShuffleLayer / @ref NEChannelShuffleLayerKernel
+ - @ref NEReduceMean
+ - @ref NEReorgLayer / @ref NEReorgLayerKernel
+ - @ref NEPriorBoxLayer / @ref NEPriorBoxLayerKernel
+ - @ref NEUpsampleLayer / @ref NEUpsampleLayerKernel
+ - @ref NEYOLOLayer / @ref NEYOLOLayerKernel
+ - New OpenCL kernels / functions:
+ - @ref CLBatchToSpaceLayer / @ref CLBatchToSpaceLayerKernel
+ - @ref CLBoundingBoxTransform / @ref CLBoundingBoxTransformKernel
+ - @ref CLComputeAllAnchorsKernel
+ - @ref CLGenerateProposalsLayer
+ - @ref CLNormalizePlanarYUVLayer / @ref CLNormalizePlanarYUVLayerKernel
+ - @ref CLReorgLayer / @ref CLReorgLayerKernel
+ - @ref CLSpaceToBatchLayer / @ref CLSpaceToBatchLayerKernel
+ - @ref CLPadLayer
+ - @ref CLReduceMean
+ - @ref CLPriorBoxLayer / @ref CLPriorBoxLayerKernel
+ - @ref CLROIAlignLayer / @ref CLROIAlignLayerKernel
+ - @ref CLSlice
+ - @ref CLSplit
+ - @ref CLStridedSlice / @ref CLStridedSliceKernel
+ - @ref CLUpsampleLayer / @ref CLUpsampleLayerKernel
+ - @ref CLYOLOLayer / @ref CLYOLOLayerKernel
+ - New CPP kernels / functions:
+ - @ref CPPBoxWithNonMaximaSuppressionLimit / @ref CPPBoxWithNonMaximaSuppressionLimitKernel
+ - Added the validate method to:
+ - @ref NEDepthConvertLayer
+ - @ref NEFloor / @ref CLFloor
+ - @ref NEGEMMMatrixAdditionKernel
+ - @ref NEReshapeLayer / @ref CLReshapeLayer
+ - @ref CLScale
+ - Added new examples:
+ - graph_shufflenet.cpp
+ - graph_yolov3.cpp
+ - Added documentation on how to add a new function or kernel.
+ - Improved the Doxygen documentation by adding a list of the existing functions.
+ - Added 4D tensor support to:
+ - @ref CLWidthConcatenateLayer
+ - @ref CLFlattenLayer
+ - @ref CLSoftmaxLayer
+ - Added dot product support to @ref CLDepthwiseConvolutionLayer3x3NHWCKernel for non-unit strides
+ - Added SVE support
+ - Fused batch normalization into convolution layer weights in @ref CLFuseBatchNormalization
+ - Fuses activation in @ref CLDepthwiseConvolutionLayer3x3NCHWKernel, @ref CLDepthwiseConvolutionLayer3x3NHWCKernel and @ref NEGEMMConvolutionLayer
+ - Added NHWC data layout support to:
+ - @ref CLChannelShuffleLayer
+ - @ref CLDeconvolutionLayer
+ - @ref CLL2NormalizeLayer
+ - Added QASYMM8 support to the following kernels:
+ - @ref CLScaleKernel
+ - @ref NEDepthwiseConvolutionLayer3x3Kernel
+ - @ref CLPixelWiseMultiplicationKernel
+ - Added FP16 support to the following kernels:
+ - @ref CLDepthwiseConvolutionLayer3x3NHWCKernel
+ - @ref NEDepthwiseConvolutionLayer3x3Kernel
+ - @ref CLNormalizePlanarYUVLayerKernel
+ - @ref CLWinogradConvolutionLayer (5x5 kernel)
+ - More tests added to both validation and benchmarking suites.
+
v18.08 Public major release
- Various bug fixes.
- Various optimisations.
@@ -509,7 +595,7 @@
- @ref NEFillArrayKernel
- @ref NEGaussianPyramidHorKernel
- @ref NEGaussianPyramidVertKernel
- - @ref NEHarrisScoreFP16Kernel
+ - NEHarrisScoreFP16Kernel
- @ref NEHarrisScoreKernel
- @ref NEHOGDetectorKernel
- @ref NELogits1DMaxKernel
diff --git a/docs/01_library.dox b/docs/01_library.dox
index bd4b300..67adf9c 100644
--- a/docs/01_library.dox
+++ b/docs/01_library.dox
@@ -1,3 +1,26 @@
+///
+/// Copyright (c) 2017-2018 ARM Limited.
+///
+/// SPDX-License-Identifier: MIT
+///
+/// Permission is hereby granted, free of charge, to any person obtaining a copy
+/// of this software and associated documentation files (the "Software"), to
+/// deal in the Software without restriction, including without limitation the
+/// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+/// sell copies of the Software, and to permit persons to whom the Software is
+/// furnished to do so, subject to the following conditions:
+///
+/// The above copyright notice and this permission notice shall be included in all
+/// copies or substantial portions of the Software.
+///
+/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+/// SOFTWARE.
+///
namespace arm_compute
{
/**
@@ -342,8 +365,6 @@
 - @ref ILifetimeManager that keeps track of the lifetime of the registered objects of the memory groups and, given an @ref IAllocator, creates an appropriate memory pool that fulfils the memory requirements of all the registered memory groups.
- @ref IPoolManager that safely manages the registered memory pools.
-@note @ref IMemoryManager::finalize should be called once the configuration of all the memory groups, kernels and functions is done, so that the memory manager can allocate the appropriate backing memory.
-
@note @ref BlobLifetimeManager is currently implemented which models the memory requirements as a vector of distinct memory blobs.
@subsection S4_7_2_working_with_memory_manager Working with the Memory Manager
@@ -385,11 +406,9 @@
@warning The configuration step should be done sequentially by a single thread so that all the lifetimes are captured correctly.
-When configuration of all the operations is finished then the memory manager have to be finalized:
+When the configuration of all the operations is finished, the memory manager has to be populated:
@code{.cpp}
-mm->set_allocator(&allocator); // Set allocator to use
-mm->set_set_num_pools(2); // Set number of pools to create in case parallel operations can be run
-mm->finalize(); // Finalize memory manager (Object lifetime check, Memory pool creation etc)
+mm->populate(&allocator, 2 /* num_pools */); // Populate memory manager pools
@endcode
Finally, during execution of the pipeline the memory of the appropriate memory group should be requested before running:
@@ -422,10 +441,8 @@
conv1.configure(...);
conv2.configure(...);
-// Finalize memory manager
-mm->set_allocator(&allocator); // Set allocator to use
-mm->set_set_num_pools(1); // Set number of pools to create in case parallel operations can be run
-mm->finalize(); // Finalize memory manager (Object lifetime check, Memory pool creation etc)
+// Populate memory manager
+mm->populate(&allocator, 1 /* num_pools */); // Populate memory manager pools
// Run layers (memory will be recycled for internal buffers for conv1 and conv2)
conv1.run();
diff --git a/docs/02_tests.dox b/docs/02_tests.dox
index 9c8d125..2363ae0 100644
--- a/docs/02_tests.dox
+++ b/docs/02_tests.dox
@@ -1,3 +1,26 @@
+///
+/// Copyright (c) 2017-2018 ARM Limited.
+///
+/// SPDX-License-Identifier: MIT
+///
+/// Permission is hereby granted, free of charge, to any person obtaining a copy
+/// of this software and associated documentation files (the "Software"), to
+/// deal in the Software without restriction, including without limitation the
+/// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+/// sell copies of the Software, and to permit persons to whom the Software is
+/// furnished to do so, subject to the following conditions:
+///
+/// The above copyright notice and this permission notice shall be included in all
+/// copies or substantial portions of the Software.
+///
+/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+/// SOFTWARE.
+///
namespace arm_compute
{
namespace test
diff --git a/docs/03_scripts.dox b/docs/03_scripts.dox
index eede8b5..11deea2 100644
--- a/docs/03_scripts.dox
+++ b/docs/03_scripts.dox
@@ -1,3 +1,28 @@
+///
+/// Copyright (c) 2017-2018 ARM Limited.
+///
+/// SPDX-License-Identifier: MIT
+///
+/// Permission is hereby granted, free of charge, to any person obtaining a copy
+/// of this software and associated documentation files (the "Software"), to
+/// deal in the Software without restriction, including without limitation the
+/// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+/// sell copies of the Software, and to permit persons to whom the Software is
+/// furnished to do so, subject to the following conditions:
+///
+/// The above copyright notice and this permission notice shall be included in all
+/// copies or substantial portions of the Software.
+///
+/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+/// SOFTWARE.
+///
+namespace arm_compute
+{
/**
@page data_import Importing data from existing models
@@ -90,3 +115,4 @@
The arm_compute::utils::load_trained_data function shows how one could load
the weights and biases into a tensor from the .npy file with the help of the Accessor.
*/
+}
\ No newline at end of file
diff --git a/docs/04_adding_operator.dox b/docs/04_adding_operator.dox
new file mode 100644
index 0000000..5f03785
--- /dev/null
+++ b/docs/04_adding_operator.dox
@@ -0,0 +1,332 @@
+///
+/// Copyright (c) 2018 ARM Limited.
+///
+/// SPDX-License-Identifier: MIT
+///
+/// Permission is hereby granted, free of charge, to any person obtaining a copy
+/// of this software and associated documentation files (the "Software"), to
+/// deal in the Software without restriction, including without limitation the
+/// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+/// sell copies of the Software, and to permit persons to whom the Software is
+/// furnished to do so, subject to the following conditions:
+///
+/// The above copyright notice and this permission notice shall be included in all
+/// copies or substantial portions of the Software.
+///
+/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+/// SOFTWARE.
+///
+
+namespace arm_compute
+{
+/**
+@page add_operator Adding new operators
+
+@tableofcontents
+
+@section S4_1_introduction Introduction
+In ACL there are two main parts or modules:
+- The core library consists of a low-level collection of algorithms implemented in C++ and optimized for Arm CPUs and GPUs. The core module is designed to be embedded in other projects and it doesn't perform any memory management or scheduling.
+- The runtime library is a wrapper around the core library and provides additional features like memory management, multithreaded execution of workloads and allocation of the intermediate tensors.
+
+The library can be integrated into an existing external library or application that provides its own scheduler or a specific memory manager. In that case, the right solution is to use only the core library, which means that the user must manage all the memory allocation themselves, not only for the input/output tensors but also for any intermediate tensors required. On the other hand, if the user doesn't want to deal with allocation and multithreading, then the right choice is to use the functions from the runtime library.
+
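+As a rough sketch of the runtime-library flow, assuming the NEON backend and using @ref NEActivationLayer as the example function (the shapes and parameters are illustrative only):
+
+@code{.cpp}
+Tensor input, output;
+input.allocator()->init(TensorInfo(TensorShape(32U, 32U), 1, DataType::F32));
+output.allocator()->init(TensorInfo(TensorShape(32U, 32U), 1, DataType::F32));
+
+NEActivationLayer act;
+act.configure(&input, &output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
+
+// The runtime library takes care of memory allocation and of scheduling the
+// underlying kernel when run() is called
+input.allocator()->allocate();
+output.allocator()->allocate();
+act.run();
+@endcode
+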
+Apart from these components that get linked into the application, the sources also include the validation test suite and the C++ reference implementations against which all the operators are validated.
+
+
+@section S4_1_supporting_new_operators Supporting new operators
+
+The following are the steps involved in adding support for a new operator in ACL:
+- Add new data types (if required).
+- Add the kernel to the core library.
+- Add the function to the runtime library.
+- Add validation tests:
+ - Add the reference implementation.
+ - Add the fixture.
+ - Register the tests.
+
+@subsection S4_1_1_add_datatypes Adding new data types
+
+ACL declares a few new data types related to its domain; the kernels and functions in the library process tensors and images (Computer Vision functions). Tensors are multi-dimensional arrays with a maximum of Coordinates::num_max_dimensions dimensions; depending on the number of dimensions, tensors can be interpreted as various objects. A scalar can be represented as a zero-dimensional tensor and a vector of numbers as a one-dimensional tensor. Furthermore, an image is just a 2D tensor, a 3D tensor can be seen as an array of images and a 4D tensor as a 2D array of images, etc.
+All the data type classes and structures are grouped in the core library folder arm_compute/core, like @ref ITensor and @ref ITensorInfo (all the information of a tensor); TensorShape and the simpler types are in arm_compute/core/Types.h.
+
+If an operator handles a new data type, it must be added to the library. When adding a new data type to the library, it's necessary to implement the functions that enable printing: the to_string() method and the output stream insertion (<<) operator. Every data type implements these two functions in utils/TypePrinter.h
+
+As a quick example, in <a href="https://github.com/ARM-software/ComputeLibrary/blob/master/arm_compute/core/Types.h">Types.h</a> we add:
+
+@snippet arm_compute/core/Types.h DataLayout enum definition
+
+And for printing:
+
+@snippet utils/TypePrinter.h Print DataLayout type
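+
+For a rough idea of what these two functions look like, here is a sketch for a hypothetical enum MyType (the real overloads for DataLayout live in utils/TypePrinter.h):
+
+@code{.cpp}
+inline ::std::ostream &operator<<(::std::ostream &os, const MyType &type)
+{
+    switch(type)
+    {
+        case MyType::FIRST:
+            os << "FIRST";
+            break;
+        case MyType::SECOND:
+            os << "SECOND";
+            break;
+        default:
+            ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
+    }
+    return os;
+}
+
+inline std::string to_string(const MyType &type)
+{
+    std::stringstream str;
+    str << type;
+    return str.str();
+}
+@endcode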
+
+In the ACL library, we use namespaces to group all the operators, functions, classes and interfaces. The main namespace to use is arm_compute. In the test suite, the test framework and the individual tests use nested namespaces like @ref test::validation or @ref test::benchmark to group the different purposes of various parts of the suite.
+Utility functions, like conversion or type cast operators, that are shared by multiple operators are in arm_compute/core/Utils.h; non-inlined function definitions go in the corresponding .cpp files in the src folder.
+Similarly, all common functions that process shapes, like calculating the output shape of an operator or shape conversions, are in arm_compute/core/utils/misc/ShapeCalculator.h.
+
+
+@subsection S4_1_2_add_kernel Add a kernel
+As we mentioned at the beginning, the kernel is the implementation of the operator or algorithm, written in a programming language specific to the backend we want to use. Adding a kernel to the library means implementing the algorithm in a SIMD technology like NEON or OpenCL. All kernels in ACL must implement a common interface, IKernel, or one of the specific sub-interfaces.
+IKernel is the common interface for all the kernels in the core library. It contains the main methods for configuring and running the kernel itself, such as window(), which returns the maximum window the kernel can be executed on, or is_parallelisable(), which indicates whether or not the kernel is parallelizable. If the kernel is parallelizable, the window returned by the window() method can be split into sub-windows which can then be run in parallel; otherwise, only the window returned by window() can be passed to the run method.
+There are specific interfaces for OpenCL and Neon: @ref ICLKernel, INEKernel (using INEKernel = @ref ICPPKernel).
+
+- @ref ICLKernel is the common interface for all the OpenCL kernels. It implements the inherited methods and adds all the methods necessary to configure the CL kernel, such as setting/returning the local-workgroup-size hint, adding single, array or tensor arguments, and setting the targeted GPU architecture according to the CL device. All these methods are used during the configuration and the run of the operator.
+- INEKernel inherits from @ref IKernel as well; it's the common interface for all kernels implemented in NEON and adds just the run and name methods.
+
+There are two other implementations of @ref IKernel, called @ref ICLSimpleKernel and INESimpleKernel; they are the interfaces for simple kernels that have just one input tensor and one output tensor.
+Creating a new kernel implies adding new files:
+- arm_compute/core/CL/kernels/CLReshapeLayerKernel.h
+- src/core/CL/cl_kernels/reshape_layer.cl
+- src/core/CL/kernels/CLReshapeLayerKernel.cpp
+- src/core/CL/CLKernelLibrary.cpp
+
+Neon kernel
+- arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h
+- src/core/NEON/kernels/NEReshapeLayerKernel.cpp
+
+We must register the new kernel in the respective backend kernel list:
+- arm_compute/core/CL/CLKernels.h
+- arm_compute/core/NEON/NEKernels.h
+
+These files contain the list of all kernels available in the corresponding ACL backend; for example, CLKernels.h:
+@code{.cpp}
+...
+#include "arm_compute/core/CL/kernels/CLMinMaxLayerKernel.h"
+#include "arm_compute/core/CL/kernels/CLMinMaxLocationKernel.h"
+...
+#include "arm_compute/core/CL/kernels/CLReshapeLayerKernel.h"
+...
+
+@endcode
+
+For OpenCL we need to update CLKernelLibrary.cpp, adding the appropriate code to embed the .cl kernel in the library. The OpenCL code can be compiled offline and embedded in the library as a binary.
+The essential operations we want to perform with a kernel are (see the sketch after this list):
+- create the kernel object;
+- initialize the kernel with the input/output and any other parameters that may be required;
+- retrieve the execution window of the kernel and run the whole kernel window in the current thread, or use multithreading.
+
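+A minimal sketch of these steps for an OpenCL kernel, assuming input and output are already-initialised CLTensor objects and using the CLReshapeLayerKernel from the running example:
+
+@code{.cpp}
+CLReshapeLayerKernel kernel;        // Create the kernel object
+kernel.configure(&input, &output);  // Initialise it with the input/output tensors
+CLScheduler::get().enqueue(kernel); // Run the whole kernel window on the CL queue
+@endcode
+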
+Each kernel will have to implement the following methods:
+- %validate: a static function that checks whether the given info will lead to a valid configuration of the kernel.
+- configure: configures the kernel, its window, accessor, valid region, etc. for the given set of tensors and other parameters.
+- run: executes the kernel within the window.
+
+The structure of the kernel .cpp file should be similar to the following examples.
+For OpenCL:
+@snippet src/core/CL/kernels/CLReshapeLayerKernel.cpp CLReshapeLayerKernel Kernel
+The run method will call the function defined in the .cl file.
+
+For the NEON backend case:
+@snippet src/core/NEON/kernels/NEReshapeLayerKernel.cpp NEReshapeLayerKernel Kernel
+
+In the NEON case, there is no need to add an extra file: we implement the kernel in the same NEReshapeLayerKernel.cpp file.
+If the tests are already in place, the new kernel can be tested using the existing tests by adding the configure and run steps of the kernel to the compute_target() method in the fixture.
+
+
+@subsection S4_1_3_add_function Add a function
+
+%Memory management and scheduling of the underlying kernel(s) must be handled by the function implementation. A kernel class must support the window() API, which returns the execution window for the configuration that the kernel is configured for. A window specifies the dimensions of a workload: it has a start and an end on each dimension, and a maximum of Coordinates::num_max_dimensions is supported. The runtime layer is expected to query the kernel for the window size and schedule the window as it sees fit. It could choose to split the window into sub-windows so that it can be run in parallel. The split must adhere to the following rules:
+
+- max[n].start() <= sub[n].start() < max[n].end()
+- sub[n].start() < sub[n].end() <= max[n].end()
+- max[n].step() == sub[n].step()
+- (sub[n].start() - max[n].start()) % max[n].step() == 0
+- (sub[n].end() - sub[n].start()) % max[n].step() == 0
+
+@ref CPPScheduler::schedule provides a sample implementation that is used for NEON kernels.
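+
+As an illustration, a runtime could split the maximum window along one dimension and schedule each sub-window on its own thread. A sketch, assuming the Window::split_window helper (this is essentially what the scheduler does internally):
+
+@code{.cpp}
+const Window max_window  = kernel->window();
+const size_t num_threads = 2;
+for(size_t t = 0; t < num_threads; ++t)
+{
+    // Each sub-window satisfies the start/end/step rules listed above
+    const Window sub_window = max_window.split_window(Window::DimX, t, num_threads);
+    // ... run the kernel on sub_window in thread t
+}
+@endcode
+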
+%Memory management is the other aspect that the runtime layer is supposed to handle. %Memory management of the tensors is abstracted using TensorAllocator. Each tensor holds a pointer to a TensorAllocator object, which is used to allocate and free the memory at runtime. The implementation currently supported in ACL allows the memory blocks required for a given operator to be grouped together under a @ref MemoryGroup. Each group can be acquired and released. The underlying implementation of memory groups varies depending on whether NEON or CL is used. The memory group class uses a memory pool to provide the required memory. It also uses the memory manager to manage the lifetime and an @ref IPoolManager to manage the memory pools registered with the memory manager.
+
+
+We have seen the various interfaces for a kernel in the core library; the same file structure design exists in the runtime module. IFunction is the base class for all the functions; it has two child interfaces, ICLSimpleFunction and INESimpleFunction, that are used as base classes for functions which call a single kernel.
+
+The new operator has to implement %validate(), configure() and run(); these methods will call the respective functions in the kernel. Multi-threading is used for the kernels which are parallelizable; by default, std::thread::hardware_concurrency() threads are used. For NEON functions, CPPScheduler::set_num_threads() can be used to manually set the number of threads, whereas for OpenCL all the kernels are enqueued on the queue associated with CLScheduler and the queue is then flushed.
+For the runtime functions, there is an extra method implemented: prepare(). This method prepares the function for the run and performs all the heavy operations that are done only once (reshape the weights, release memory that is no longer necessary after the reshape, etc.). The prepare method can be called standalone or during the first run; if it has not been called before, the first run will call it, after which the function is marked as prepared.
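+
+A sketch of the prepare()/run() interaction; CLExampleFunction, _kernel and the _is_prepared flag are illustrative names only:
+
+@code{.cpp}
+void CLExampleFunction::run()
+{
+    // The first call does the one-off heavy work, later calls are no-ops
+    prepare();
+    _memory_group.acquire();
+    CLScheduler::get().enqueue(_kernel);
+    _memory_group.release();
+}
+
+void CLExampleFunction::prepare()
+{
+    if(!_is_prepared)
+    {
+        // One-off operations: reshape the weights, release buffers that are
+        // no longer needed after the reshape, etc.
+        _is_prepared = true;
+    }
+}
+@endcode
+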
+The files we add are:
+
+OpenCL function
+- arm_compute/runtime/CL/functions/CLReshapeLayer.h
+- src/runtime/CL/functions/CLReshapeLayer.cpp
+
+Neon function
+- arm_compute/runtime/NEON/functions/NEReshapeLayer.h
+- src/runtime/NEON/functions/NEReshapeLayer.cpp
+
+As we did for the kernel, we have to register the new operator in the runtime library by modifying the corresponding list header:
+- arm_compute/runtime/CL/CLFunctions.h
+- arm_compute/runtime/NEON/NEFunctions.h
+
+For the special case where the new function calls only one kernel, we could use ICLSimpleFunction or INESimpleFunction as the base class. The configure and validate methods will simply call the corresponding kernel functions. The structure will be:
+@snippet src/runtime/CL/functions/CLReshapeLayer.cpp CLReshapeLayer snippet
+
+
+If the function is more complicated and calls more than one kernel, we have to use the memory manager to manage the intermediate tensors: in the configure() method we call the manage() function, passing the tensors to keep track of; in the run() method we have to acquire all the managed buffers and release them at the end.
+For OpenCL, if we want to add two input tensors and reshape the result:
+
+@code{.cpp}
+using namespace arm_compute;
+
+CLAddReshapeLayer::CLAddReshapeLayer(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager))
+{
+}
+
+void CLAddReshapeLayer::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
+{
+    // Initialise the intermediate tensor
+    TensorInfo info;
+    _add_output.allocator()->init(info);
+
+    // Manage intermediate buffers
+    _memory_group.manage(&_add_output);
+
+    // Initialise kernels
+    _add_kernel.configure(input1, input2, &_add_output);
+    _reshape_kernel.configure(&_add_output, output);
+
+    // Allocate intermediate tensors
+    _add_output.allocator()->allocate();
+}
+
+Status CLAddReshapeLayer::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+    TensorInfo add_output;
+    ARM_COMPUTE_RETURN_ON_ERROR(CLAddLayerKernel::validate(input1, input2, &add_output));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayerKernel::validate(&add_output, output));
+    return Status{};
+}
+
+void CLAddReshapeLayer::run()
+{
+    _memory_group.acquire();
+
+    // Run Add
+    CLScheduler::get().enqueue(_add_kernel);
+
+    // Run Reshape
+    CLScheduler::get().enqueue(_reshape_kernel);
+
+    _memory_group.release();
+}
+
+@endcode
+
+For NEON:
+
+@code{.cpp}
+using namespace arm_compute;
+
+NEAddReshapeLayer::NEAddReshapeLayer(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager))
+{
+}
+
+void NEAddReshapeLayer::configure(const ITensor *input1, const ITensor *input2, ITensor *output)
+{
+    // Initialise the intermediate tensor
+    TensorInfo info;
+    _add_output.allocator()->init(info);
+
+    // Manage intermediate buffers
+    _memory_group.manage(&_add_output);
+
+    // Initialise kernels
+    _add_kernel.configure(input1, input2, &_add_output);
+    _reshape_kernel.configure(&_add_output, output);
+
+    // Allocate intermediate tensors
+    _add_output.allocator()->allocate();
+}
+
+void NEAddReshapeLayer::run()
+{
+    _memory_group.acquire();
+
+    // Run Add
+    NEScheduler::get().schedule(&_add_kernel, Window::DimY);
+
+    // Run Reshape
+    NEScheduler::get().schedule(&_reshape_kernel, Window::DimY);
+
+    _memory_group.release();
+}
+@endcode
+
+
+At this point, everything is in place at the library level. If you are following a test-driven implementation and all the tests are already in place, we can call the function configuration in the fixture and remove any redundant code, like the allocation of the intermediate tensors, since it's done in the function. Run the final tests to check that the results match the expected results from the reference implementation.
+
+@subsection S4_1_4_add_validation Add validation artifacts
+
+@subsubsection S4_1_4_1_add_reference Add the reference implementation and the tests
+As mentioned in the introduction, the reference implementation is a pure C++ implementation without any optimizations or backend-specific instructions.
+The reference implementation consists of two files in the folder tests/validation/reference:
+- tests/validation/reference/ReshapeLayer.h
+- tests/validation/reference/ReshapeLayer.cpp
+
+where we will put, respectively, the declaration and definition of the new operator.
+All the utility functions that are used ONLY in the tests are in tests/validation/helpers.h; for all the others, as mentioned before, there are helpers in the library.
+ACL and the tests make heavy use of templates: the reference implementation is a generic implementation independent of the data type, and we use templates to generalize over the data type.
+Following the example, let's have a look at the ReshapeLayer operator:
+
+- tests/validation/reference/ReshapeLayer.h
+
+@snippet tests/validation/reference/ReshapeLayer.h ReshapeLayer
+
+- tests/validation/reference/ReshapeLayer.cpp
+
+@snippet tests/validation/reference/ReshapeLayer.cpp ReshapeLayer
+
+An explicit instantiation of the template for the required data types must be added in the .cpp file.
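+
+A sketch of what those instantiations look like, assuming the reshape_layer reference signature shown in the snippet above:
+
+@code{.cpp}
+template SimpleTensor<float>   reshape_layer(const SimpleTensor<float> &src, const TensorShape &output_shape);
+template SimpleTensor<half>    reshape_layer(const SimpleTensor<half> &src, const TensorShape &output_shape);
+template SimpleTensor<uint8_t> reshape_layer(const SimpleTensor<uint8_t> &src, const TensorShape &output_shape);
+@endcode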
+
+@subsubsection S4_1_4_2_add_dataset Add dataset
+One of the parameters of the tests is the dataset; it is used to generate versions of the test case with different inputs.
+To pass the dataset to the fixture data test case we have three options (an inline example follows this list):
+- the operator dataset is simple, so it can be added directly in the test case data declaration;
+- we can create a class that returns tuples to the test framework;
+
+@snippet tests/datasets/PoolingTypesDataset.h PoolingTypes datasets
+
+- if we want to create the dataset dynamically by combining different parameters, we can create the dataset using iterators.
+For example, the dataset for ReshapeLayer:
+
+@snippet tests/datasets/ReshapeLayerDataset.h ReshapeLayer datasets
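+
+For the first, simplest case, a dataset can be declared inline with the framework::dataset::make helper used throughout the test suite (a sketch):
+
+@code{.cpp}
+// Every generated test case instance receives one DataType value
+const auto ReshapeDataTypes = framework::dataset::make("DataType",
+{
+    DataType::U8, DataType::F16, DataType::F32
+});
+@endcode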
+
+@subsubsection S4_1_4_3_add_fixture Add a fixture and a data test case
+
+Benchmark and validation tests are based on the same framework to set up and run the tests. In addition to running simple, self-contained test functions, the framework supports fixtures and data test cases.
+Fixtures can be used to share common setup, teardown or even run tasks among multiple test cases; for that purpose a fixture can define "setup", "teardown" and "run" methods.
+When adding tests for the new operator in the runtime library, we need to implement at least the setup method, which is used to configure and run both the target (CL or Neon) and the reference (C++ implementation) and to return their respective outputs.
+
+For example, let's have a look at the Reshape Layer fixture:
+
+@snippet tests/validation/fixtures/ReshapeLayerFixture.h ReshapeLayer fixture
+
+In the fixture class above we can see that the setup method computes the target and reference outputs and stores them in the two members _target and _reference, which will be used later to check for correctness.
+The compute_target method reflects the exact behavior expected when we call a function: the input and output tensors must be declared, the function configured, the tensors allocated, the input tensor filled with the required data, and finally, the function must be run and the results returned.
+This fixture is used in the test case, which is a parameterized test case that inherits from a fixture. The test case will have access to all public and protected members of the fixture. Only the setup and teardown methods of the fixture will be used. The setup method of the fixture needs to be a template and must accept inputs from the dataset as arguments.
+The body of this function will be used as a test function.
+For the fixture test case, the first argument is the name of the test case (which has to be unique within the enclosing test suite), the second argument is the class name of the fixture, the third argument is the dataset mode in which the test will be active (PRECOMMIT or NIGHTLY) and the fourth argument is the dataset.
+For example:
+
+@snippet tests/validation/CL/ActivationLayer.cpp CLActivationLayerFixture snippet
+
+@code{.cpp}
+TEST_SUITE(CL)
+TEST_SUITE(ActivationLayer)
+TEST_SUITE(Float)
+TEST_SUITE(FP16)
+@endcode
+@snippet tests/validation/CL/ActivationLayer.cpp CLActivationLayer Test snippet
+@code{.cpp}
+TEST_SUITE_END()
+TEST_SUITE_END()
+TEST_SUITE_END()
+TEST_SUITE_END()
+@endcode
+
+This will produce a set of tests that can be filtered with "CL/ReshapeLayer/Float/FP16/RunSmall". Each test produced from the Cartesian product of the dataset is associated with a number and can be filtered by specifying all the parameters.
+*/
+} // namespace arm_compute
diff --git a/docs/05_functions_list.dox b/docs/05_functions_list.dox
new file mode 100644
index 0000000..9848307
--- /dev/null
+++ b/docs/05_functions_list.dox
@@ -0,0 +1,300 @@
+///
+/// Copyright (c) 2018 ARM Limited.
+///
+/// SPDX-License-Identifier: MIT
+///
+/// Permission is hereby granted, free of charge, to any person obtaining a copy
+/// of this software and associated documentation files (the "Software"), to
+/// deal in the Software without restriction, including without limitation the
+/// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+/// sell copies of the Software, and to permit persons to whom the Software is
+/// furnished to do so, subject to the following conditions:
+///
+/// The above copyright notice and this permission notice shall be included in all
+/// copies or substantial portions of the Software.
+///
+/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+/// SOFTWARE.
+///
+namespace arm_compute
+{
+/**
+
+@page functions_list List of functions
+
+@tableofcontents
+
+@section S5_1 NEON functions
+
+- @ref IFunction
+ - @ref INESimpleFunction
+ - @ref NEAbsoluteDifference
+ - @ref NEAccumulate
+ - @ref NEAccumulateSquared
+ - @ref NEAccumulateWeighted
+ - @ref NEActivationLayer
+ - @ref NEArithmeticAddition
+ - @ref NEArithmeticSubtraction
+ - @ref NEBitwiseAnd
+ - @ref NEBitwiseNot
+ - @ref NEBitwiseOr
+ - @ref NEBitwiseXor
+ - @ref NEBox3x3
+ - @ref NEChannelCombine
+ - @ref NEChannelExtract
+ - @ref NEChannelShuffleLayer
+ - @ref NECol2Im
+ - @ref NEColorConvert
+ - @ref NEConvolution3x3
+ - @ref NEConvolutionRectangle
+ - @ref NECopy
+ - @ref NEDepthConvertLayer
+ - @ref NEDilate
+ - @ref NEErode
+ - @ref NEFlattenLayer
+ - @ref NEFloor
+ - @ref NEFullyConnectedLayerReshapeWeights
+ - @ref NEGEMMInterleave4x4
+ - @ref NEGEMMLowpQuantizeDownInt32ToUint8Scale
+ - @ref NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint
+ - @ref NEGEMMTranspose1xW
+ - @ref NEGaussian3x3
+ - @ref NEHOGDetector
+ - @ref NEIntegralImage
+ - @ref NEMagnitude
+ - @ref NEMedian3x3
+ - @ref NENonLinearFilter
+ - @ref NENonMaximaSuppression3x3
+ - @ref NEPermute
+ - @ref NEPhase
+ - @ref NEPixelWiseMultiplication
+ - @ref NEPriorBoxLayer
+ - @ref NERemap
+ - @ref NEReorgLayer
+ - @ref NEReshapeLayer
+ - @ref NEScharr3x3
+ - @ref NESobel3x3
+ - @ref NETableLookup
+ - @ref NEThreshold
+ - @ref NETranspose
+ - @ref NEWarpAffine
+ - @ref NEWarpPerspective
+ - @ref NEYOLOLayer
+ - @ref NEBatchNormalizationLayer
+ - @ref NECannyEdge
+ - @ref NEConcatenateLayer
+ - @ref NEConvertFullyConnectedWeights
+ - @ref NEConvolutionLayer
+ - @ref NEConvolutionLayerReshapeWeights
+ - @ref NEConvolutionSquare<matrix_size>
+ - @ref NEDeconvolutionLayer
+ - @ref NEDepthConcatenateLayer
+ - @ref NEDepthwiseConvolutionLayer
+ - @ref NEDepthwiseConvolutionLayer3x3
+ - @ref NEDepthwiseSeparableConvolutionLayer
+ - @ref NEDequantizationLayer
+ - @ref NEDerivative
+ - @ref NEDirectConvolutionLayer
+ - @ref NEEqualizeHistogram
+ - @ref NEFastCorners
+ - @ref NEFillBorder
+ - @ref NEFullyConnectedLayer
+ - @ref NEGEMM
+ - @ref NEGEMMAssemblyDispatch
+ - @ref NEGEMMConvolutionLayer
+ - @ref NEGEMMInterleavedWrapper
+ - @ref NEGEMMLowpAssemblyMatrixMultiplyCore
+ - @ref NEGEMMLowpMatrixMultiplyCore
+ - @ref NEGaussian5x5
+ - @ref NEGaussianPyramid
+ - @ref NEGaussianPyramidHalf
+ - @ref NEGaussianPyramidOrb
+ - @ref NEHOGDescriptor
+ - @ref NEHOGGradient
+ - @ref NEHOGMultiDetection
+ - @ref NEHarrisCorners
+ - @ref NEHistogram
+ - @ref NEIm2Col
+ - @ref NEL2NormalizeLayer
+ - @ref NELSTMLayer
+ - @ref NELaplacianPyramid
+ - @ref NELaplacianReconstruct
+ - @ref NELocallyConnectedLayer
+ - @ref NEMeanStdDev
+ - @ref NEMinMaxLocation
+ - @ref NENormalizationLayer
+ - @ref NEOpticalFlow
+ - @ref NEPoolingLayer
+ - @ref NEQuantizationLayer
+ - @ref NERNNLayer
+ - @ref NEROIPoolingLayer
+ - @ref NEReduceMean
+ - @ref NEReductionOperation
+ - @ref NEScale
+ - @ref NESimpleAssemblyFunction
+ - @ref NESobel5x5
+ - @ref NESobel7x7
+ - @ref NESoftmaxLayer
+ - @ref NEUpsampleLayer
+ - @ref NEWidthConcatenateLayer
+ - @ref NEWinogradConvolutionLayer
+
+@section S5_2 OpenCL functions
+
+- @ref IFunction
+ - @ref CLBatchNormalizationLayer
+ - @ref CLBatchToSpaceLayer
+ - @ref CLCannyEdge
+ - @ref CLConcatenateLayer
+ - @ref CLConvolutionLayer
+ - @ref CLConvolutionLayerReshapeWeights
+ - @ref CLConvolutionSquare<matrix_size>
+ - @ref CLDeconvolutionLayer
+ - @ref CLDeconvolutionLayerUpsample
+ - @ref CLDepthConcatenateLayer
+ - @ref CLDepthwiseConvolutionLayer
+ - @ref CLDepthwiseConvolutionLayer3x3
+ - @ref CLDepthwiseSeparableConvolutionLayer
+ - @ref CLDequantizationLayer
+ - @ref CLDirectConvolutionLayer
+ - @ref CLEqualizeHistogram
+ - @ref CLFastCorners
+ - @ref CLFullyConnectedLayer
+ - @ref CLFuseBatchNormalization
+ - @ref CLGEMM
+ - @ref CLGEMMConvolutionLayer
+ - @ref CLGEMMLowpMatrixMultiplyCore
+ - @ref CLGaussian5x5
+ - @ref CLGaussianPyramid
+ - @ref CLGaussianPyramidHalf
+ - @ref CLGaussianPyramidOrb
+ - @ref CLGenerateProposalsLayer
+ - @ref CLHOGDescriptor
+ - @ref CLHOGDetector
+ - @ref CLHOGGradient
+ - @ref CLHOGMultiDetection
+ - @ref CLHarrisCorners
+ - @ref CLHistogram
+ - @ref CLIntegralImage
+ - @ref CLL2NormalizeLayer
+ - @ref CLLSTMLayer
+ - @ref CLLaplacianPyramid
+ - @ref CLLaplacianReconstruct
+ - @ref CLLocallyConnectedLayer
+ - @ref CLMeanStdDev
+ - @ref CLMinMaxLocation
+ - @ref CLNormalizationLayer
+ - @ref CLNormalizePlanarYUVLayer
+ - @ref CLOpticalFlow
+ - @ref CLPadLayer
+ - @ref CLQuantizationLayer
+ - @ref CLRNNLayer
+ - @ref CLReduceMean
+ - @ref CLReductionOperation
+ - @ref CLSobel5x5
+ - @ref CLSobel7x7
+ - @ref CLSoftmaxLayer
+ - @ref CLSpaceToBatchLayer
+ - @ref CLSplit
+ - @ref CLUpsampleLayer
+ - @ref CLWidthConcatenateLayer
+ - @ref CLWinogradConvolutionLayer
+ - @ref ICLSimpleFunction
+ - @ref CLAbsoluteDifference
+ - @ref CLAccumulate
+ - @ref CLAccumulateSquared
+ - @ref CLAccumulateWeighted
+ - @ref CLActivationLayer
+ - @ref CLArithmeticAddition
+ - @ref CLArithmeticDivision
+ - @ref CLArithmeticSubtraction
+ - @ref CLBitwiseAnd
+ - @ref CLBitwiseNot
+ - @ref CLBitwiseOr
+ - @ref CLBitwiseXor
+ - @ref CLBoundingBoxTransform
+ - @ref CLBox3x3
+ - @ref CLChannelCombine
+ - @ref CLChannelExtract
+ - @ref CLChannelShuffleLayer
+ - @ref CLColorConvert
+ - @ref CLConvertFullyConnectedWeights
+ - @ref CLConvolution3x3
+ - @ref CLConvolutionRectangle
+ - @ref CLCopy
+ - @ref CLDepthConvertLayer
+ - @ref CLDerivative
+ - @ref CLDilate
+ - @ref CLErode
+ - @ref CLFillBorder
+ - @ref CLFlattenLayer
+ - @ref CLFloor
+ - @ref CLFullyConnectedLayerReshapeWeights
+ - @ref CLGEMMInterleave4x4
+ - @ref CLGEMMLowpQuantizeDownInt32ToUint8Scale
+ - @ref CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint
+ - @ref CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloat
+ - @ref CLGEMMTranspose1xW
+ - @ref CLGaussian3x3
+ - @ref CLMagnitude
+ - @ref CLMedian3x3
+ - @ref CLNonLinearFilter
+ - @ref CLNonMaximaSuppression3x3
+ - @ref CLPermute
+ - @ref CLPhase
+ - @ref CLPixelWiseMultiplication
+ - @ref CLPoolingLayer
+ - @ref CLPriorBoxLayer
+ - @ref CLROIAlignLayer
+ - @ref CLROIPoolingLayer
+ - @ref CLRemap
+ - @ref CLReorgLayer
+ - @ref CLReshapeLayer
+ - @ref CLScale
+ - @ref CLScharr3x3
+ - @ref CLSlice
+ - @ref CLSobel3x3
+ - @ref CLStridedSlice
+ - @ref CLTableLookup
+ - @ref CLThreshold
+ - @ref CLTranspose
+ - @ref CLWarpAffine
+ - @ref CLWarpPerspective
+ - @ref CLWinogradInputTransform
+ - @ref CLYOLOLayer
+
+@section S5_3 GLES Compute functions
+
+- @ref IFunction
+ - @ref GCBatchNormalizationLayer
+ - @ref GCConvolutionLayer
+ - @ref GCConvolutionLayerReshapeWeights
+ - @ref GCDepthConcatenateLayer
+ - @ref GCDepthwiseConvolutionLayer3x3
+ - @ref GCDirectConvolutionLayer
+ - @ref GCDropoutLayer
+ - @ref GCFullyConnectedLayer
+ - @ref GCGEMM
+ - @ref GCNormalizationLayer
+ - @ref GCNormalizePlanarYUVLayer
+ - @ref GCPoolingLayer
+ - @ref GCSoftmaxLayer
+ - @ref IGCSimpleFunction
+ - @ref GCAbsoluteDifference
+ - @ref GCActivationLayer
+ - @ref GCArithmeticAddition
+ - @ref GCFillBorder
+ - @ref GCFullyConnectedLayerReshapeWeights
+ - @ref GCGEMMInterleave4x4
+ - @ref GCGEMMTranspose1xW
+ - @ref GCPixelWiseMultiplication
+ - @ref GCScale
+ - @ref GCTensorShift
+ - @ref GCTranspose
+*/
+} // namespace
diff --git a/docs/Doxyfile b/docs/Doxyfile
index 67cdc00..d2d728d 100644
--- a/docs/Doxyfile
+++ b/docs/Doxyfile
@@ -38,7 +38,7 @@
# could be handy for archiving the generated documentation or if some version
# control system is used.
-PROJECT_NUMBER = 18.08
+PROJECT_NUMBER = 18.11
# Using the PROJECT_BRIEF tag one can provide an optional one line description
# for a project that appears at the top of each page and should give viewer a
@@ -769,11 +769,13 @@
# Note: If this tag is empty the current directory is searched.
INPUT = ./docs/00_introduction.dox \
+ ./docs/05_functions_list.dox \
./docs/01_library.dox \
+ ./docs/04_adding_operator.dox \
./docs/02_tests.dox \
./docs/03_scripts.dox \
./arm_compute/ \
- ./src/core/CL/cl_kernels/ \
+ ./src/ \
./examples/ \
./tests/ \
./utils/ \
@@ -854,8 +856,13 @@
# Note that relative paths are relative to the directory from which doxygen is
# run.
-EXCLUDE = ./arm_compute/core/NEON/kernels/assembly/ \
+EXCLUDE = ./arm_compute/core/NEON/kernels/assembly/ \
./arm_compute/core/NEON/kernels/convolution/ \
+ ./src/core/NEON/kernels/assembly/ \
+ ./src/core/NEON/kernels/convolution/ \
+ ./src/core/NEON/kernels/NELKTrackerKernel.cpp \
+ ./src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp \
+ ./src/core/GLES_COMPUTE/cs_shaders/ \
./tests/datasets/ \
./tests/benchmark/fixtures/ \
./tests/validation/fixtures/