arm_compute v17.09
Change-Id: I4bf8f4e6e5f84ce0d5b6f5ba570d276879f42a81
diff --git a/docs/00_introduction.dox b/docs/00_introduction.dox
index 1fb94ed..2b6ddfb 100644
--- a/docs/00_introduction.dox
+++ b/docs/00_introduction.dox
@@ -36,33 +36,50 @@
├── arm_compute --> All the arm_compute headers
│ ├── core
│ │ ├── CL
+ │ │ │ ├── CLKernelLibrary.h --> Manages all the OpenCL kernels compilation and caching, provides accessors for the OpenCL Context.
│ │ │ ├── CLKernels.h --> Includes all the OpenCL kernels at once
│ │ │ ├── CL specialisation of all the generic objects interfaces (ICLTensor, ICLImage, etc.)
│ │ │ ├── kernels --> Folder containing all the OpenCL kernels
│ │ │ │ └── CL*Kernel.h
│ │ │ └── OpenCL.h --> Wrapper to configure the Khronos OpenCL C++ header
│ │ ├── CPP
+ │ │ │ ├── CPPKernels.h --> Includes all the CPP kernels at once
│ │ │ └── kernels --> Folder containing all the CPP kernels
- │ │ │ │ └── CPP*Kernel.h
+ │ │ │ └── CPP*Kernel.h
│ │ ├── NEON
│ │ │ ├── kernels --> Folder containing all the NEON kernels
+ │ │ │ │ ├── arm64 --> Folder containing the interfaces for the assembly arm64 NEON kernels
+ │ │ │ │ ├── arm32 --> Folder containing the interfaces for the assembly arm32 NEON kernels
+ │ │ │ │ ├── assembly --> Folder containing the NEON assembly routines.
│ │ │ │ └── NE*Kernel.h
│ │ │ └── NEKernels.h --> Includes all the NEON kernels at once
│ │ ├── All common basic types (Types.h, Window, Coordinates, Iterator, etc.)
│ │ ├── All generic objects interfaces (ITensor, IImage, etc.)
│ │ └── Objects metadata classes (ImageInfo, TensorInfo, MultiImageInfo)
+ │ ├── graph
+ │ │ ├── CL --> OpenCL specific operations
+ │ │ │ └── CLMap.h / CLUnmap.h
+ │ │ ├── nodes
+ │ │ │ └── The various nodes supported by the graph API
+ │ │ ├── Nodes.h --> Includes all the Graph nodes at once.
+ │ │ └── Graph objects (INode, ITensorAccessor, Graph, etc.)
│ └── runtime
│ ├── CL
│ │ ├── CL objects & allocators (CLArray, CLImage, CLTensor, etc.)
│ │ ├── functions --> Folder containing all the OpenCL functions
│ │ │ └── CL*.h
+ │ │ ├── CLScheduler.h --> Interface to enqueue OpenCL kernels and get/set the OpenCL CommandQueue and ICLTuner.
│ │ └── CLFunctions.h --> Includes all the OpenCL functions at once
│ ├── CPP
- │ │ └── Scheduler.h --> Basic pool of threads to execute CPP/NEON code on several cores in parallel
+ │ │ ├── CPPKernels.h --> Includes all the CPP functions at once.
+ │ │ └── CPPScheduler.h --> Basic pool of threads to execute CPP/NEON code on several cores in parallel
│ ├── NEON
│ │ ├── functions --> Folder containing all the NEON functions
│ │ │ └── NE*.h
│ │ └── NEFunctions.h --> Includes all the NEON functions at once
+ │ ├── OMP
+ │ │ └── OMPScheduler.h --> OpenMP scheduler (Alternative to the CPPScheduler)
+ │ ├── Memory manager files (LifetimeManager, PoolManager, etc.)
│ └── Basic implementations of the generic object interfaces (Array, Image, Tensor, etc.)
├── documentation
│ ├── index.xhtml
@@ -70,36 +87,55 @@
├── documentation.xhtml -> documentation/index.xhtml
├── examples
│ ├── cl_convolution.cpp
+ │ ├── cl_events.cpp
+ │ ├── graph_lenet.cpp
│ ├── neoncl_scale_median_gaussian.cpp
+ │ ├── neon_cnn.cpp
+ │ ├── neon_copy_objects.cpp
│ ├── neon_convolution.cpp
│ └── neon_scale.cpp
├── include
- │ └── CL
- │ └── Khronos OpenCL C headers and C++ wrapper
+ │ ├── CL
+ │ │ └── Khronos OpenCL C headers and C++ wrapper
+ │ ├── half --> FP16 library available from http://half.sourceforge.net
+ │ └── libnpy --> Library to load / write npy buffers, available from https://github.com/llohse/libnpy
├── opencl-1.2-stubs
│ └── opencl_stubs.c
+ ├── scripts
+ │ ├── caffe_data_extractor.py --> Basic script to export weights from Caffe to npy files
+ │ ├── tensorflow_data_extractor.py --> Basic script to export weights from TensorFlow to npy files
├── src
│ ├── core
│ │ └── ... (Same structure as headers)
│ │ └── CL
│ │ └── cl_kernels --> All the OpenCL kernels
+ │ ├── graph
+ │ │ └── ... (Same structure as headers)
│ └── runtime
│ └── ... (Same structure as headers)
+ ├── support
+ │ └── Various headers to work around toolchains / platform issues.
├── tests
│ ├── All test related files shared between validation and benchmark
- │ ├── CL --> OpenCL specific files (shared)
- │ ├── NEON --> NEON specific files (shared)
+ │ ├── CL --> OpenCL accessors
+ │ ├── NEON --> NEON accessors
│ ├── benchmark --> Sources for benchmarking
│ │ ├── Benchmark specific files
- │ │ ├── main.cpp --> Entry point for benchmark test framework
│ │ ├── CL --> OpenCL benchmarking tests
│ │ └── NEON --> NEON benchmarking tests
+ │ ├── datasets
+ │ │ └── Datasets for all the validation / benchmark tests, layer configurations for various networks, etc.
+ │ ├── framework
+ │ │ └── Boilerplate code for both validation and benchmark test suites (Command line parsers, instruments, output loggers, etc.)
+ │ ├── networks
+ │ │ └── Examples of how to instantiate networks.
│ ├── validation --> Sources for validation
│ │ ├── Validation specific files
- │ │ ├── main.cpp --> Entry point for validation test framework
│ │ ├── CL --> OpenCL validation tests
- │ │ ├── NEON --> NEON validation tests
- │ │ └── UNIT --> Library validation tests
+ │ │ ├── CPP --> C++ reference implementations
+ │ │ ├── fixtures
+ │ │ │ └── Fixtures to initialise and run the runtime Functions.
+ │ │ └── NEON --> NEON validation tests
│ └── dataset --> Datasets defining common sets of input parameters
└── utils --> Boilerplate code used by examples
└── Utils.h
@@ -119,6 +155,35 @@
@subsection S2_2_changelog Changelog
+v17.09 Public major release
+ - Experimental Graph support: initial implementation of a simple stream API to easily chain machine learning layers.
+ - Memory Manager (@ref arm_compute::BlobLifetimeManager, @ref arm_compute::BlobMemoryPool, @ref arm_compute::ILifetimeManager, @ref arm_compute::IMemoryGroup, @ref arm_compute::IMemoryManager, @ref arm_compute::IMemoryPool, @ref arm_compute::IPoolManager, @ref arm_compute::MemoryManagerOnDemand, @ref arm_compute::PoolManager)
+ - New validation and benchmark frameworks (Boost and Google frameworks replaced by homemade framework).
+ - Most machine learning functions support both fixed point 8 and 16 bit (QS8, QS16) for both NEON and OpenCL.
+ - New NEON kernels / functions:
+ - @ref arm_compute::NEGEMMAssemblyBaseKernel, @ref arm_compute::NEGEMMAArch64Kernel
+ - @ref arm_compute::NEDequantizationLayerKernel / @ref arm_compute::NEDequantizationLayer
+ - @ref arm_compute::NEFloorKernel / @ref arm_compute::NEFloor
+ - @ref arm_compute::NEL2NormalizeKernel / @ref arm_compute::NEL2Normalize
+ - @ref arm_compute::NEQuantizationLayerKernel, @ref arm_compute::NEMinMaxLayerKernel / @ref arm_compute::NEQuantizationLayer
+ - @ref arm_compute::NEROIPoolingLayerKernel / @ref arm_compute::NEROIPoolingLayer
+ - @ref arm_compute::NEReductionOperationKernel / @ref arm_compute::NEReductionOperation
+ - @ref arm_compute::NEReshapeLayerKernel / @ref arm_compute::NEReshapeLayer
+
+ - New OpenCL kernels / functions:
+ - @ref arm_compute::CLDepthwiseConvolution3x3Kernel, @ref arm_compute::CLDepthwiseIm2ColKernel, @ref arm_compute::CLDepthwiseVectorToTensorKernel, @ref arm_compute::CLDepthwiseWeightsReshapeKernel / @ref arm_compute::CLDepthwiseConvolution3x3, @ref arm_compute::CLDepthwiseConvolution, @ref arm_compute::CLDepthwiseSeparableConvolutionLayer
+ - @ref arm_compute::CLDequantizationLayerKernel / @ref arm_compute::CLDequantizationLayer
+ - @ref arm_compute::CLDirectConvolutionLayerKernel / @ref arm_compute::CLDirectConvolutionLayer
+ - @ref arm_compute::CLFlattenLayer
+ - @ref arm_compute::CLFloorKernel / @ref arm_compute::CLFloor
+ - @ref arm_compute::CLGEMMTranspose1xW
+ - @ref arm_compute::CLGEMMMatrixVectorMultiplyKernel
+ - @ref arm_compute::CLL2NormalizeKernel / @ref arm_compute::CLL2Normalize
+ - @ref arm_compute::CLQuantizationLayerKernel, @ref arm_compute::CLMinMaxLayerKernel / @ref arm_compute::CLQuantizationLayer
+ - @ref arm_compute::CLROIPoolingLayerKernel / @ref arm_compute::CLROIPoolingLayer
+ - @ref arm_compute::CLReductionOperationKernel / @ref arm_compute::CLReductionOperation
+ - @ref arm_compute::CLReshapeLayerKernel / @ref arm_compute::CLReshapeLayer
+
v17.06 Public major release
- Various bug fixes
- Added support for fixed point 8 bit (QS8) to the various NEON machine learning kernels.
@@ -172,7 +237,6 @@
- @ref arm_compute::NENonMaximaSuppression3x3FP16Kernel
- @ref arm_compute::NENonMaximaSuppression3x3Kernel
-
v17.03.1 First Major public release of the sources
- Renamed the library to arm_compute
- New CPP target introduced for C++ kernels shared between NEON and CL functions.
@@ -205,7 +269,7 @@
- New OpenCL kernels / functions:
- @ref arm_compute::CLLogits1DMaxKernel, @ref arm_compute::CLLogits1DShiftExpSumKernel, @ref arm_compute::CLLogits1DNormKernel / @ref arm_compute::CLSoftmaxLayer
- @ref arm_compute::CLPoolingLayerKernel / @ref arm_compute::CLPoolingLayer
- - @ref arm_compute::CLIm2ColKernel, @ref arm_compute::CLCol2ImKernel, @ref arm_compute::CLConvolutionLayerWeightsReshapeKernel / @ref arm_compute::CLConvolutionLayer
+ - @ref arm_compute::CLIm2ColKernel, @ref arm_compute::CLCol2ImKernel, arm_compute::CLConvolutionLayerWeightsReshapeKernel / @ref arm_compute::CLConvolutionLayer
- @ref arm_compute::CLRemapKernel / @ref arm_compute::CLRemap
- @ref arm_compute::CLGaussianPyramidHorKernel, @ref arm_compute::CLGaussianPyramidVertKernel / @ref arm_compute::CLGaussianPyramid, @ref arm_compute::CLGaussianPyramidHalf, @ref arm_compute::CLGaussianPyramidOrb
- @ref arm_compute::CLMinMaxKernel, @ref arm_compute::CLMinMaxLocationKernel / @ref arm_compute::CLMinMaxLocation
@@ -303,6 +367,10 @@
default: False
actual: False
+ mali: Enable Mali hardware counters (yes|no)
+ default: False
+ actual: False
+
validation_tests: Build validation test programs (yes|no)
default: False
actual: False
@@ -349,13 +417,11 @@
@b validation_tests: Enable the build of the validation suite.
-@note You will need the Boost Test and Program options headers and libraries to build the validation tests. See @ref building_boost for more information.
-
@b benchmark_tests: Enable the build of the benchmark tests
@b pmu: Enable the PMU cycle counter to measure execution time in benchmark tests. (Your device needs to support it)
-@note You will need the Boost Program options and Google Benchmark headers and libraries to build the benchmark tests. See @ref building_google_benchmark for more information.
+@b mali: Enable the collection of Mali hardware counters to measure execution time in benchmark tests. (Your device needs to have a Mali driver that supports it)
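+
+For example, a build that enables both the benchmark suite and the collection of Mali counters might be configured as follows (all other options omitted):
+
+    scons benchmark_tests=1 mali=1 ...
+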
@b openmp Build in the OpenMP scheduler for NEON.
@@ -365,7 +431,7 @@
@sa arm_compute::Scheduler::set
-@subsection S3_2_linux Linux
+@subsection S3_2_linux Building for Linux
@subsubsection S3_2_1_library How to build the library ?
@@ -424,11 +490,11 @@
To cross compile an OpenCL example for Linux 32bit:
- arm-linux-gnueabihf-g++ examples/cl_convolution.cpp utils/Utils.cpp -I. -Iinclude -std=c++11 -mfpu=neon -L. -larm_compute -lOpenCL -o cl_convolution
+ arm-linux-gnueabihf-g++ examples/cl_convolution.cpp utils/Utils.cpp -I. -Iinclude -std=c++11 -mfpu=neon -L. -larm_compute -lOpenCL -o cl_convolution -DARM_COMPUTE_CL
To cross compile an OpenCL example for Linux 64bit:
- aarch64-linux-gnu-g++ examples/cl_convolution.cpp utils/Utils.cpp -I. -Iinclude -std=c++11 -L. -larm_compute -lOpenCL -o cl_convolution
+ aarch64-linux-gnu-g++ examples/cl_convolution.cpp utils/Utils.cpp -I. -Iinclude -std=c++11 -L. -larm_compute -lOpenCL -o cl_convolution -DARM_COMPUTE_CL
(notice that the only differences from the 32 bit command are that we don't need the -mfpu option and that the compiler's name is different)
@@ -444,7 +510,7 @@
To compile natively (i.e directly on an ARM device) for OpenCL for Linux 32bit or Linux 64bit:
- g++ examples/cl_convolution.cpp utils/Utils.cpp -I. -Iinclude -std=c++11 -larm_compute -lOpenCL -o cl_convolution
+ g++ examples/cl_convolution.cpp utils/Utils.cpp -I. -Iinclude -std=c++11 -larm_compute -lOpenCL -o cl_convolution -DARM_COMPUTE_CL
@note These two commands assume libarm_compute.so is available in your library path, if not add the path to it using -L
@@ -459,7 +525,7 @@
@note If you built the library with support for both OpenCL and NEON you will need to link against OpenCL even if your application only uses NEON.
-@subsection S3_3_android Android
+@subsection S3_3_android Building for Android
For Android, the library was successfully built and tested using Google's standalone toolchains:
- arm-linux-androideabi-4.9 for armv7a (clang++)
@@ -509,9 +575,9 @@
To cross compile an OpenCL example:
#32 bit:
- arm-linux-androideabi-clang++ examples/cl_convolution.cpp utils/Utils.cpp -I. -Iinclude -std=c++11 -larm_compute-static -L. -o cl_convolution_arm -static-libstdc++ -pie -lOpenCL
+ arm-linux-androideabi-clang++ examples/cl_convolution.cpp utils/Utils.cpp -I. -Iinclude -std=c++11 -larm_compute-static -L. -o cl_convolution_arm -static-libstdc++ -pie -lOpenCL -DARM_COMPUTE_CL
#64 bit:
- aarch64-linux-android-g++ examples/cl_convolution.cpp utils/Utils.cpp -I. -Iinclude -std=c++11 -larm_compute-static -L. -o cl_convolution_aarch64 -static-libstdc++ -pie -lOpenCL
+ aarch64-linux-android-g++ examples/cl_convolution.cpp utils/Utils.cpp -I. -Iinclude -std=c++11 -larm_compute-static -L. -o cl_convolution_aarch64 -static-libstdc++ -pie -lOpenCL -DARM_COMPUTE_CL
@note Due to some issues in older versions of the Mali OpenCL DDK (<= r13p0), we recommend linking arm_compute statically on Android.
@@ -537,7 +603,35 @@
adb shell /data/local/tmp/neon_convolution_aarch64
adb shell /data/local/tmp/cl_convolution_aarch64
-@subsection S3_4_cl_stub_library The OpenCL stub library
+@subsection S3_4_windows_host Building on a Windows host system
+
+Using `scons` directly from the Windows command line is known to cause
+problems. The reason seems to be that if `scons` is set up for cross-compilation
+it gets confused about Windows-style paths (using backslashes). Thus it is
+recommended to follow one of the options outlined below.
+
+@subsubsection S3_4_1_ubuntu_on_windows Bash on Ubuntu on Windows
+
+The best and easiest option is to use
+<a href="https://msdn.microsoft.com/en-gb/commandline/wsl/about">Ubuntu on Windows</a>.
+This feature is still marked as *beta* and thus might not be available.
+However, if it is, building the library is as simple as opening a *Bash on
+Ubuntu on Windows* shell and following the general guidelines given above.
+
+@subsubsection S3_4_2_cygwin Cygwin
+
+If the Windows Subsystem for Linux is not available, <a href="https://www.cygwin.com/">Cygwin</a>
+can be used to install and run `scons`. In addition to the default packages
+installed by Cygwin, `scons` has to be selected in the installer. (`git` might
+also be useful but is not strictly required if you already have the source
+code of the library.)
+<a href="http://releases.linaro.org/components/toolchain/binaries/">GCC cross-compilers</a>
+that can be used from the Cygwin terminal. When building for Android the
+compiler is included in the Android standalone toolchain. After everything has
+been set up in the Cygwin terminal the general guide on building the library
+can be followed.
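+
+As an illustration, once the Cygwin or WSL environment is set up, building works the same as on a native Linux host. A plausible cross-compilation invocation for Android (the option values are illustrative and assume the standalone toolchain is on the PATH) could look like:
+
+    scons Werror=1 -j8 debug=0 neon=1 opencl=1 os=android arch=armv7a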
+
+@subsection S3_5_cl_stub_library The OpenCL stub library
In the opencl-1.2-stubs folder you will find the sources to build a stub OpenCL library which then can be used to link your application or arm_compute against.
diff --git a/docs/01_library.dox b/docs/01_library.dox
index 738579e..c7903ba 100644
--- a/docs/01_library.dox
+++ b/docs/01_library.dox
@@ -1,6 +1,6 @@
namespace arm_compute
{
-/**
+/**
@page architecture Library architecture
@tableofcontents
@@ -83,7 +83,7 @@
@sa CPPScheduler.
-@note Some kernels like for example @ref NEHistogramKernel need some local temporary buffer to perform their calculations. In order to avoid memory corruption between threads, the local buffer must be of size: ```memory_needed_per_thread * num_threads``` and each subwindow must be initialized by calling @ref Window::set_thread_id() with a unique thread_id between 0 and num_threads.
+@note Some kernels, for example @ref NEHistogramKernel, need a local temporary buffer to perform their calculations. In order to avoid memory corruption between threads, the local buffer must be of size ```memory_needed_per_thread * num_threads``` and a unique thread_id between 0 and num_threads must be assigned to the @ref ThreadInfo object passed to the ```run``` function.
@subsection S4_2_4 Functions
@@ -246,5 +246,125 @@
Here are a couple of examples of how to use the iterators to fill / read tensors:
@snippet examples/neon_copy_objects.cpp Copy objects example
+
+@section S4_7_memory_manager MemoryManager
+
+@ref IMemoryManager is a memory managing interface that can be used to reduce the memory requirements of a given pipeline by recycling temporary buffers.
+
+@subsection S4_7_1_memory_manager_components MemoryGroup, MemoryPool and MemoryManager Components
+
+@subsubsection S4_7_1_1_memory_group MemoryGroup
+
+@ref IMemoryGroup defines the memory managing granularity.
+
+MemoryGroup binds a number of objects to a bucket of memory requirements that need to be fulfilled in order for an operation or list of operations to be executed.
+
+Requesting backing memory for a specific group can be done using @ref IMemoryGroup::acquire and releasing the memory back using @ref IMemoryGroup::release.
+
+@note Two types of memory groups are currently implemented:
+- @ref MemoryGroup that manages @ref Tensor objects
+- @ref CLMemoryGroup that manages @ref CLTensor objects.
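+
+As a minimal sketch of this pattern (assuming a function `op` has already been configured and its temporary buffers are managed by `group`):
+@code{.cpp}
+group.acquire(); // Fulfil the group's memory requirements
+op.run();        // Run the operation that uses the managed buffers
+group.release(); // Hand the memory back so that it can be reused
+@endcode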
+
+@subsubsection S4_7_1_2_memory_pool MemoryPool
+
+@ref IMemoryPool defines a pool of memory that can be used to provide backing memory to a memory group.
+
+@note Currently, only @ref BlobMemoryPool is implemented, which models the memory requirements as a vector of distinct memory blobs.
+
+@subsubsection S4_7_1_3_memory_manager_components MemoryManager Components
+
+@ref IMemoryManager consists of two components:
+- @ref ILifetimeManager that keeps track of the lifetime of the registered objects of the memory groups and, given an @ref IAllocator, creates an appropriate memory pool that fulfils the memory requirements of all the registered memory groups.
+- @ref IPoolManager that safely manages the registered memory pools.
+
+@note @ref IMemoryManager::finalize should be called once the configuration of all the memory groups, kernels and functions is done, so that the memory manager can allocate the appropriate backing memory.
+
+@note Currently, only @ref BlobLifetimeManager is implemented, which models the memory requirements as a vector of distinct memory blobs.
+
+@subsection S4_7_2_working_with_memory_manager Working with the Memory Manager
+Using a memory manager to reduce the memory requirements of a pipeline can be summed up in the following steps:
+
+Initially a memory manager must be set up:
+@code{.cpp}
+Allocator allocator{}; // Create an allocator to use for the backing memory allocation
+auto lifetime_mgr = std::make_shared<BlobLifetimeManager>(); // Create Lifetime Manager
+auto pool_mgr = std::make_shared<PoolManager>(); // Create Pool Manager
+auto mm = std::make_shared<MemoryManagerOnDemand>(lifetime_mgr, pool_mgr); // Create Memory Manager
+@endcode
+
+Once done, memory groups can be registered to use the memory manager:
+@code{.cpp}
+MemoryGroup memory_group(mm); // Create a memory group and set the memory manager to use
+@endcode
+
+@note If a memory manager is not specified then all allocation will be immediate instead of deferred through the memory manager.
+
+The next step is to register the objects to be managed by the memory group. Note that the lifetime of an object is tracked by the @ref MemoryGroup::manage() and @ref TensorAllocator::allocate calls:
+@ref MemoryGroup::manage flags that the object will be needed starting now, and when @ref TensorAllocator::allocate is called it signals the end of the object's lifetime.
+@code{.cpp}
+Tensor tmp1, tmp2, tmp3; // Create example tensors
+memory_group.manage(&tmp1); // Start managing object tmp1 and start its lifetime
+memory_group.manage(&tmp2); // Start managing object tmp2 and start its lifetime
+
+operation1.configure(&tmp1, &tmp2); // Configure a function/kernel using tmp1 and tmp2
+
+tmp1.allocator()->allocate(); // Flag that the lifetime of object tmp1 has ended
+
+memory_group.manage(&tmp3); // Start managing object tmp3 and start its lifetime
+
+operation2.configure(&tmp2, &tmp3); // Configure a function/kernel using tmp2 and tmp3
+
+tmp2.allocator()->allocate(); // Flag that the lifetime of object tmp2 has ended
+tmp3.allocator()->allocate(); // Flag that the lifetime of object tmp3 has ended
+@endcode
+
+@warning The configuration step should be done sequentially by a single thread so that all the lifetimes are captured correctly.
+
+When the configuration of all the operations is finished, the memory manager has to be finalized:
+@code{.cpp}
+mm->set_allocator(&allocator); // Set allocator to use
+mm->set_num_pools(2); // Set number of pools to create in case parallel operations can be run
+mm->finalize(); // Finalize memory manager (Object lifetime check, Memory pool creation etc)
+@endcode
+
+Finally, during execution of the pipeline the memory of the appropriate memory group should be requested before running:
+@code{.cpp}
+memory_group.acquire(); // Request memory for the group
+
+operation1.run(); // Run operation1
+operation2.run(); // Run operation2
+
+memory_group.release(); // Release memory so that it can be reused
+@endcode
+@note Execution of a pipeline can be done in a multi-threading environment as memory acquisition/release are thread safe.
+
+@subsection S4_7_3_memory_manager_function_support Function support
+
+Most of the library's functions have been ported to use @ref IMemoryManager for their internal temporary buffers.
+
+If that is the case, a memory manager can be passed to them during construction to reuse memory among these functions.
+@code{.cpp}
+// Setup Memory Manager
+CLBufferAllocator allocator{}; // Create an allocator to use for the backing memory allocation
+auto lifetime_mgr = std::make_shared<BlobLifetimeManager>(); // Create Lifetime Manager
+auto pool_mgr = std::make_shared<PoolManager>(); // Create Pool Manager
+auto mm = std::make_shared<MemoryManagerOnDemand>(lifetime_mgr, pool_mgr); // Create Memory Manager
+
+// Create two convolution layers and use the memory manager to manage their internal temporary buffers
+CLConvolutionLayer conv1(mm), conv2(mm);
+
+// Configure layers
+conv1.configure(...);
+conv2.configure(...);
+
+// Finalize memory manager
+mm->set_allocator(&allocator); // Set allocator to use
+mm->set_num_pools(1); // Set number of pools to create in case parallel operations can be run
+mm->finalize(); // Finalize memory manager (Object lifetime check, Memory pool creation etc)
+
+// Run layers (Memory will be recycled for internal buffers for conv1 and conv2)
+conv1.run();
+conv2.run();
+@endcode
*/
} // namespace arm_compute
diff --git a/docs/02_tests.dox b/docs/02_tests.dox
index bf8838c..c39431f 100644
--- a/docs/02_tests.dox
+++ b/docs/02_tests.dox
@@ -1,95 +1,324 @@
+namespace arm_compute
+{
+namespace test
+{
/**
@page tests Validation and benchmarks tests
@tableofcontents
-@section building_test_dependencies Building dependencies
+@section tests_overview Overview
-The tests currently make use of Boost (Test and Program options) for validation
-and Google Benchmark for performance runs. Below are instructions about how to
-build these 3rd party libraries.
+Benchmark and validation tests are based on the same framework to set up and run
+the tests. In addition to running simple, self-contained test functions the
+framework supports fixtures and data test cases. The former allows sharing
+common setup routines between various backends, thus reducing the amount of
+duplicated code. The latter can be used to parameterize tests or fixtures with
+different inputs, e.g. different tensor shapes. One limitation is that
+tests/fixtures cannot be parameterized based on the data type if static type
+information is needed within the test (e.g. to validate the results).
-@note By default the build of the validation and benchmark tests is disabled, to enable it use `validation_tests=1` and `benchmark_tests=1`
+@subsection tests_overview_fixtures Fixtures
-@subsection building_boost Building Boost
+Fixtures can be used to share common setup, teardown or even run tasks among
+multiple test cases. For that purpose a fixture can define a `setup`,
+`teardown` and `run` method. Additionally the constructor and destructor might
+also be customized.
-First follow the instructions from the Boost library on how to setup the Boost
-build system
-(http://www.boost.org/doc/libs/1_64_0/more/getting_started/index.html).
-Afterwards the required libraries can be build with:
+An instance of the fixture is created immediately before the actual test is
+executed. After construction the @ref framework::Fixture::setup method is called. Then the test
+function or the fixture's `run` method is invoked. After test execution the
+@ref framework::Fixture::teardown method is called and lastly the fixture is destructed.
- ./b2 --with-program_options --with-test link=static \
- define=BOOST_TEST_ALTERNATIVE_INIT_API
+@subsubsection tests_overview_fixtures_fixture Fixture
-Additionally, depending on your environment, it might be necessary to specify
-the ```toolset=``` option to choose the right compiler. Moreover,
-```address-model=32``` can be used to force building for 32bit and
-```target-os=android``` must be specified to build for Android.
+Fixtures for non-parameterized tests are straightforward. The custom fixture
+class has to inherit from @ref framework::Fixture and can implement any of the
+`setup`, `teardown` or `run` methods. None of the methods takes any arguments
+or returns anything.
-After executing the build command the libraries
-```libboost_program_options.a``` and ```libboost_unit_test_framework.a``` can
-be found in ```./stage/lib```.
+ class CustomFixture : public framework::Fixture
+ {
+ void setup()
+ {
+ _ptr = malloc(4000);
+ }
-@subsection building_google_benchmark Building Google Benchmark
+ void run()
+ {
+ ARM_COMPUTE_ASSERT(_ptr != nullptr);
+ }
-Instructions on how to build Google Benchmark using CMake can be found in their
-repository: https://github.com/google/benchmark. For example, building for
-Android 32bit can be achieved via
+ void teardown()
+ {
+ free(_ptr);
+ }
- cmake -DCMAKE_BUILD_TYPE=Release \
- -DCMAKE_CXX_COMPILER=arm-linux-androideabi-clang++ \
- -DBENCHMARK_ENABLE_LTO=false -DBENCHMARK_ENABLE_TESTING=false ..
+ void *_ptr;
+ };
-The library required by the compute library is ```libbenchmark.a```.
+@subsubsection tests_overview_fixtures_data_fixture Data fixture
+
+The advantage of a parameterized fixture is that arguments can be passed to the setup method at runtime. To make this possible, the setup method has to be a template with a type parameter for every argument (though the template parameter doesn't have to be used). All other methods remain the same.
+
+ class CustomFixture : public framework::Fixture
+ {
+ #ifdef ALTERNATIVE_DECLARATION
+ template <typename ...>
+ void setup(size_t size)
+ {
+ _ptr = malloc(size);
+ }
+ #else
+ template <typename T>
+ void setup(T size)
+ {
+ _ptr = malloc(size);
+ }
+ #endif
+
+ void run()
+ {
+ ARM_COMPUTE_ASSERT(_ptr != nullptr);
+ }
+
+ void teardown()
+ {
+ free(_ptr);
+ }
+
+ void *_ptr;
+ };
+
+@subsection tests_overview_test_cases Test cases
+
+All following commands can be optionally prefixed with `EXPECTED_FAILURE_` or
+`DISABLED_`.
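+
+For example, a test that is known to fail can be marked so that its failure is treated as expected (the test body is purely illustrative):
+
+    EXPECTED_FAILURE_TEST_CASE(KnownIssue, DatasetMode::PRECOMMIT)
+    {
+        ARM_COMPUTE_ASSERT_EQUAL(1 + 1, 3); // Fails, which is what we expect
+    }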
+
+@subsubsection tests_overview_test_cases_test_case Test case
+
+A simple test case function taking no inputs and having no (shared) state.
+
+- First argument is the name of the test case (has to be unique within the
+ enclosing test suite).
+- Second argument is the dataset mode in which the test will be active.
+
+
+ TEST_CASE(TestCaseName, DatasetMode::PRECOMMIT)
+ {
+ ARM_COMPUTE_ASSERT_EQUAL(1 + 1, 2);
+ }
+
+@subsubsection tests_overview_test_cases_fixture_fixture_test_case Fixture test case
+
+A simple test case function taking no inputs that inherits from a fixture. The
+test case will have access to all public and protected members of the fixture.
+Only the setup and teardown methods of the fixture will be used. The body of
+this function will be used as the test function.
+
+- First argument is the name of the test case (has to be unique within the
+ enclosing test suite).
+- Second argument is the class name of the fixture.
+- Third argument is the dataset mode in which the test will be active.
+
+
+ class FixtureName : public framework::Fixture
+ {
+ public:
+ void setup() override
+ {
+ _one = 1;
+ }
+
+ protected:
+ int _one;
+ };
+
+ FIXTURE_TEST_CASE(TestCaseName, FixtureName, DatasetMode::PRECOMMIT)
+ {
+ ARM_COMPUTE_ASSERT_EQUAL(_one + 1, 2);
+ }
+
+@subsubsection tests_overview_test_cases_fixture_register_fixture_test_case Registering a fixture as test case
+
+Allows using a fixture directly as a test case. Instead of defining a new test
+function, the fixture's run method will be executed.
+
+- First argument is the name of the test case (has to be unique within the
+ enclosing test suite).
+- Second argument is the class name of the fixture.
+- Third argument is the dataset mode in which the test will be active.
+
+
+ class FixtureName : public framework::Fixture
+ {
+ public:
+ void setup() override
+ {
+ _one = 1;
+ }
+
+ void run() override
+ {
+ ARM_COMPUTE_ASSERT_EQUAL(_one + 1, 2);
+ }
+
+ protected:
+ int _one;
+ };
+
+ REGISTER_FIXTURE_TEST_CASE(TestCaseName, FixtureName, DatasetMode::PRECOMMIT);
+
+
+@subsubsection tests_overview_test_cases_data_test_case Data test case
+
+A parameterized test case function that has no (shared) state. The dataset will
+be used to generate versions of the test case with different inputs.
+
+- First argument is the name of the test case (has to be unique within the
+ enclosing test suite).
+- Second argument is the dataset mode in which the test will be active.
+- Third argument is the dataset.
+- Further arguments specify names of the arguments to the test function. The
+ number must match the arity of the dataset.
+
+
+ DATA_TEST_CASE(TestCaseName, DatasetMode::PRECOMMIT, framework::make("Numbers", {1, 2, 3}), num)
+ {
+ ARM_COMPUTE_ASSERT(num < 4);
+ }
+
+@subsubsection tests_overview_test_cases_fixture_data_test_case Fixture data test case
+
+A parameterized test case that inherits from a fixture. The test case will have
+access to all public and protected members of the fixture. Only the setup and
+teardown methods of the fixture will be used. The setup method of the fixture
+needs to be a template and has to accept inputs from the dataset as arguments.
+The body of this function will be used as the test function. The dataset will be
+used to generate versions of the test case with different inputs.
+
+- First argument is the name of the test case (has to be unique within the
+ enclosing test suite).
+- Second argument is the class name of the fixture.
+- Third argument is the dataset mode in which the test will be active.
+- Fourth argument is the dataset.
+
+
+ class FixtureName : public framework::Fixture
+ {
+ public:
+ template <typename T>
+ void setup(T num)
+ {
+ _num = num;
+ }
+
+ protected:
+ int _num;
+ };
+
+ FIXTURE_DATA_TEST_CASE(TestCaseName, FixtureName, DatasetMode::PRECOMMIT, framework::make("Numbers", {1, 2, 3}))
+ {
+ ARM_COMPUTE_ASSERT(_num < 4);
+ }
+
+@subsubsection tests_overview_test_cases_register_fixture_data_test_case Registering a fixture as data test case
+
+Allows using a fixture directly as a parameterized test case. Instead of
+defining a new test function, the fixture's run method will be executed.
+The setup method of the fixture needs to be a template and has to accept inputs
+from the dataset as arguments. The dataset will be used to generate versions of
+the test case with different inputs.
+
+- First argument is the name of the test case (has to be unique within the
+ enclosing test suite).
+- Second argument is the class name of the fixture.
+- Third argument is the dataset mode in which the test will be active.
+- Fourth argument is the dataset.
+
+
+ class FixtureName : public framework::Fixture
+ {
+ public:
+ template <typename T>
+ void setup(T num)
+ {
+ _num = num;
+ }
+
+ void run() override
+ {
+ ARM_COMPUTE_ASSERT(_num < 4);
+ }
+
+ protected:
+ int _num;
+ };
+
+ REGISTER_FIXTURE_DATA_TEST_CASE(TestCaseName, FixtureName, DatasetMode::PRECOMMIT, framework::make("Numbers", {1, 2, 3}));
+
+@section writing_tests Writing validation tests
+
+Before starting a new test case have a look at the existing ones. They should
+provide a good overview of how test cases are structured.
+
+- The C++ reference needs to be added to `tests/validation/CPP/`. The
+ reference function is typically a template parameterized by the underlying
+ value type of the `SimpleTensor`, which makes it easy to specialise for
+ different data types (see the sketch after this list).
+- If all backends have a common interface it makes sense to share the setup
+ code. This can be done by adding a fixture in
+ `tests/validation/fixtures/`. Inside the `setup` method of a fixture
+ the tensors can be created and initialised and the function can be configured
+ and run. The actual test will only have to validate the results. To be shared
+ among multiple backends the fixture class is usually a template that accepts
+ the specific types (data, tensor class, function class etc.) as parameters.
+- The actual test cases need to be added for each backend individually.
+ Typically there will be multiple tests for different data types and for
+ different execution modes, e.g. precommit and nightly.
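+
+As an illustration of the first point, here is a minimal sketch of what a reference implementation might look like. The operation (an element-wise floor) and all names are hypothetical:
+
+    // Hypothetical reference in tests/validation/CPP/, templated on the
+    // underlying value type of the SimpleTensor so that it can be
+    // instantiated for different data types.
+    template <typename T>
+    SimpleTensor<T> floor_layer(const SimpleTensor<T> &src)
+    {
+        SimpleTensor<T> dst{ src.shape(), src.data_type() };
+
+        for(int i = 0; i < src.num_elements(); ++i)
+        {
+            dst[i] = std::floor(src[i]);
+        }
+
+        return dst;
+    }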
@section tests_running_tests Running tests
@subsection tests_running_tests_benchmarking Benchmarking
@subsubsection tests_running_tests_benchmarking_filter Filter tests
All tests can be run by invoking
- ./arm_compute_benchmark -- ./data
+ ./arm_compute_benchmark ./data
where `./data` contains the assets needed by the tests.
-If only a subset of the tests has to be executed the `--benchmark_filter` option takes a regular expression to select matching tests.
+If only a subset of the tests has to be executed the `--filter` option takes a
+regular expression to select matching tests.
- ./arm_compute_benchmark --benchmark_filter=neon_bitwise_and ./data
+ ./arm_compute_benchmark --filter='NEON/.*AlexNet' ./data
-All available tests can be displayed with the `--benchmark_list_tests` switch.
+Additionally each test has a test id which can be used as a filter, too.
+However, the test id is not guaranteed to be stable when new tests are added.
+The id only stays the same within a specific build.
- ./arm_compute_benchmark --benchmark_list_tests ./data
+ ./arm_compute_benchmark --filter-id=10 ./data
+
+All available tests can be displayed with the `--list-tests` switch.
+
+ ./arm_compute_benchmark --list-tests
+
+More options can be found in the `--help` message.
@subsubsection tests_running_tests_benchmarking_runtime Runtime
-By default every test is run multiple *iterations* until a minimum time is reached. The minimum time (in seconds) can be controlled with the `--benchmark_min_time` flag. However, each test might have a hard coded value for the number of iterations or minimum execution time. In that case the command line argument is ignored for those specific tests.
-Additionally it is possible to specify multiple *repetitions* (`--benchmark_repetitions`) which will run each test multiple times (including the iterations). The average and standard deviation for all repetitions is automatically computed and reported.
+By default every test is run once on a single thread. The number of iterations
+can be controlled via the `--iterations` option and the number of threads via
+`--threads`.
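+
+For example, a plausible invocation that runs every test 10 times across 4 threads (the values are illustrative):
+
+    ./arm_compute_benchmark --iterations=10 --threads=4 ./data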
-@subsubsection tests_running_tests_benchmarking_verbosity Verbosity
-The verbosity of the test output can be controlled via the `--v` flag. Though it should hardly ever be necessary.
+@subsubsection tests_running_tests_benchmarking_output Output
+By default the benchmarking results are printed in a human readable format on
+the command line. The colored output can be disabled via `--no-color-output`.
+As an alternative output format JSON is supported and can be selected via
+`--log-format=json`. To write the output to a file instead of stdout the
+`--log-file` option can be used.
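+
+For example, to write the results as JSON to a file instead of stdout (the file name is illustrative):
+
+    ./arm_compute_benchmark --log-format=json --log-file=benchmark.json ./data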
@subsection tests_running_tests_validation Validation
-@subsubsection tests_running_tests_validation_filter Filter tests
-All tests can be run by invoking
- ./arm_compute_validation -- ./data
+@note The new validation tests have the same interface as the benchmarking tests.
-where `./data` contains the assets needed by the tests.
-
-As running all tests can take a lot of time the suite is split into "precommit" and "nightly" tests. The precommit tests will be fast to execute but still cover the most important features. In contrast the nightly tests offer more extensive coverage but take longer. The different subsets can be selected from the command line as follows:
-
- ./arm_compute_validation -t @precommit -- ./data
- ./arm_compute_validation -t @nightly -- ./data
-
-Additionally it is possible to select specific suites or tests:
-
- ./arm_compute_validation -t CL -- ./data
- ./arm_compute_validation -t NEON/BitwiseAnd/RunSmall/_0 -- ./data
-
-All available tests can be displayed with the `--list_content` switch.
-
- ./arm_compute_validation --list_content -- ./data
-
-For a complete list of possible selectors please see: http://www.boost.org/doc/libs/1_64_0/libs/test/doc/html/boost_test/runtime_config/test_unit_filtering.html
-
-@subsubsection tests_running_tests_validation_verbosity Verbosity
-There are two separate flags to control the verbosity of the test output. `--report_level` controls the verbosity of the summary produced after all tests have been executed. `--log_level` controls the verbosity of the information generated during the execution of tests. All available settings can be found in the Boost documentation for [--report_level](http://www.boost.org/doc/libs/1_64_0/libs/test/doc/html/boost_test/utf_reference/rt_param_reference/report_level.html) and [--log_level](http://www.boost.org/doc/libs/1_64_0/libs/test/doc/html/boost_test/utf_reference/rt_param_reference/log_level.html), respectively.
*/
+} // namespace test
+} // namespace arm_compute
diff --git a/docs/03_scripts.dox b/docs/03_scripts.dox
new file mode 100644
index 0000000..5601428
--- /dev/null
+++ b/docs/03_scripts.dox
@@ -0,0 +1,92 @@
+/**
+@page data_import Importing data from existing models
+
+@tableofcontents
+
+@section caffe_data_extractor Extract data from pre-trained caffe model
+
+One can find caffe <a href="https://github.com/BVLC/caffe/wiki/Model-Zoo">pre-trained models</a> on
+caffe's official github repository.
+
+The caffe_data_extractor.py provided in the @ref scripts folder is an example script that shows how to
+extract the parameter values from a trained model.
+
+@note Complex networks might require altering the script to work properly.
+
+@subsection caffe_how_to How to use the script
+
+Install caffe following <a href="http://caffe.berkeleyvision.org/installation.html">caffe's installation instructions</a>.
+Make sure pycaffe has been added to the PYTHONPATH.
+
+Download the pre-trained caffe model.
+
+Run the caffe_data_extractor.py script with:
+
+ python caffe_data_extractor.py -m <caffe model> -n <caffe netlist>
+
+For example, to extract the data from the pre-trained caffe AlexNet model to binary files:
+
+ python caffe_data_extractor.py -m /path/to/bvlc_alexnet.caffemodel -n /path/to/caffe/models/bvlc_alexnet/deploy.prototxt
+
+The script has been tested under Python 2.7.
+
+@subsection caffe_result What is the expected output from the script
+
+If the script runs successfully, it prints the names and shapes of each layer onto the standard
+output and generates *.npy files containing the weights and biases of each layer.
+
+The function @ref arm_compute::utils::load_trained_data shows how one could load
+the weights and biases from the .npy files into a tensor with the help of an Accessor.
+
+@section tensorflow_data_extractor Extract data from pre-trained tensorflow model
+
+The script tensorflow_data_extractor.py extracts trainable parameters (e.g. values of weights and biases) from a
+trained tensorflow model. A tensorflow model consists of the following two files:
+
+{model_name}.data-{step}-{global_step}: A binary file containing values of each variable.
+
+{model_name}.meta: A binary file containing a MetaGraph struct which defines the graph structure of the neural
+network.
+
+@note Since Tensorflow version 0.11 the binary checkpoint file which contains the values for each parameter has the format of:
+ {model_name}.data-{step}-of-{max_step}
+instead of:
+ {model_name}.ckpt
+When dealing with binary files with version >= 0.11, only pass {model_name} to -m option;
+when dealing with binary files with version < 0.11, pass the whole file name {model_name}.ckpt to -m option.
+
+@note This script relies on the parameters to be extracted being in the
+'trainable_variables' tensor collection. By default all variables are automatically added to this collection unless
+specified otherwise by the user. Thus should a user alter this default behavior and/or want to extract parameters from other
+collections, tf.GraphKeys.TRAINABLE_VARIABLES should be replaced accordingly.
+
+@subsection tensorflow_how_to How to use the script
+
+Install tensorflow and numpy.
+
+Download the pre-trained tensorflow model.
+
+Run tensorflow_data_extractor.py with
+
+ python tensorflow_data_extractor -m <path_to_binary_checkpoint_file> -n <path_to_metagraph_file>
+
+For example, to extract the data from the pre-trained tensorflow AlexNet model to binary files:
+
+ python tensorflow_data_extractor -m /path/to/bvlc_alexnet -n /path/to/bvlc_alexnet.meta
+
+Or for binary checkpoint files before Tensorflow 0.11:
+
+ python tensorflow_data_extractor -m /path/to/bvlc_alexnet.ckpt -n /path/to/bvlc_alexnet.meta
+
+@note With Tensorflow versions >= 0.11, only the model name is passed to the -m option.
+
+The script has been tested with Tensorflow 1.2, 1.3 on Python 2.7.6 and Python 3.4.3.
+
+@subsection tensorflow_result What is the expected output from the script
+
+If the script runs successfully, it prints the names and shapes of each parameter onto the standard output and generates
+ *.npy files containing the weights and biases of each layer.
+
+The function @ref arm_compute::utils::load_trained_data shows how one could load
+the weights and biases from the .npy files into a tensor with the help of an Accessor.
+*/
diff --git a/docs/Doxyfile b/docs/Doxyfile
index 0876f3a..ee50981 100644
--- a/docs/Doxyfile
+++ b/docs/Doxyfile
@@ -38,7 +38,7 @@
# could be handy for archiving the generated documentation or if some version
# control system is used.
-PROJECT_NUMBER = v17.06
+PROJECT_NUMBER = 17.09
# Using the PROJECT_BRIEF tag one can provide an optional one line description
# for a project that appears at the top of each page and should give viewer a
@@ -771,11 +771,14 @@
INPUT = ./docs/00_introduction.dox \
./docs/01_library.dox \
./docs/02_tests.dox \
+ ./docs/03_scripts.dox \
./arm_compute/ \
+ ./scripts/ \
./src/core/CL/cl_kernels/ \
./examples/ \
./tests/ \
- ./utils/
+ ./utils/ \
+ ./support/
# This tag can be used to specify the character encoding of the source files
# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
@@ -1054,7 +1057,7 @@
# compiled with the --with-libclang option.
# The default value is: NO.
-#CLANG_ASSISTED_PARSING = NO
+CLANG_ASSISTED_PARSING = NO
# If clang assisted parsing is enabled you can provide the compiler with command
# line options that you would normally use when invoking the compiler. Note that
@@ -1062,7 +1065,7 @@
# specified with INPUT and INCLUDE_PATH.
# This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES.
-#CLANG_OPTIONS =
+CLANG_OPTIONS = -std=c++11
#---------------------------------------------------------------------------
# Configuration options related to the alphabetical class index