arm_compute v20.11
diff --git a/docs/00_introduction.dox b/docs/00_introduction.dox
index bb1dfec..8eb0762 100644
--- a/docs/00_introduction.dox
+++ b/docs/00_introduction.dox
@@ -69,158 +69,7 @@
- A @ref utils folder containing headers with some boiler plate code used by the examples.
- This documentation.
-You should have the following file organisation:
-
- .
- ├── arm_compute --> All the arm_compute headers
- │ ├── graph.h --> Includes all the Graph headers at once.
- │ ├── core
- │ │ ├── CL
- │ │ │ ├── CLKernelLibrary.h --> Manages all the OpenCL kernels compilation and caching, provides accessors for the OpenCL Context.
- │ │ │ ├── CLKernels.h --> Includes all the OpenCL kernels at once
- │ │ │ ├── CL specialisation of all the generic interfaces (ICLTensor, ICLArray, etc.)
- │ │ │ ├── gemm --> Folder containing all the configuration files for GEMM
- │ │ │ ├── kernels --> Folder containing all the OpenCL kernels
- │ │ │ │ └── CL*Kernel.h
- │ │ │ └── OpenCL.h --> Wrapper to configure the Khronos OpenCL C++ header
- │ │ ├── CPP
- │ │ │ ├── CPPKernels.h --> Includes all the CPP kernels at once
- │ │ │ └── kernels --> Folder containing all the CPP kernels
- │ │ │ └── CPP*Kernel.h
- │ │ ├── GLES_COMPUTE
- │ │ │ ├── GCKernelLibrary.h --> Manages all the GLES kernels compilation and caching, provides accessors for the GLES Context.
- │ │ │ ├── GCKernels.h --> Includes all the GLES kernels at once
- │ │ │ ├── GLES specialisation of all the generic interfaces (IGCTensor etc.)
- │ │ │ ├── kernels --> Folder containing all the GLES kernels
- │ │ │ │ └── GC*Kernel.h
- │ │ │ └── OpenGLES.h --> Wrapper to configure the Khronos EGL and OpenGL ES C header
- │ │ ├── NEON
- │ │ │ ├── kernels --> Folder containing all the NEON kernels
- │ │ │ │ ├── assembly --> headers for assembly optimised NEON kernels.
- │ │ │ │ ├── convolution --> headers for convolution assembly optimised NEON kernels.
- │ │ │ │ │ ├── common --> headers for code which is common to several convolution implementations.
- │ │ │ │ │ ├── depthwise --> headers for Depthwise convolution assembly implementation
- │ │ │ │ │ └── winograd --> headers for Winograd convolution assembly implementation
- │ │ │ │ ├── detail --> Common code for several intrinsics implementations.
- │ │ │ │ └── NE*Kernel.h
- │ │ │ ├── wrapper --> NEON wrapper used to simplify code
- │ │ │ │ ├── intrinsics --> NEON intrinsics wrappers
- │ │ │ │ ├── scalar --> Scalar operations
- │ │ │ │ ├── traits.h --> Traits defined on NEON vectors
- │ │ │ │ └── wrapper.h --> Includes all wrapper headers at once
- │ │ │ └── NEKernels.h --> Includes all the NEON kernels at once
- │ │ ├── All common basic types (Types.h, Window, Coordinates, Iterator, etc.)
- │ │ ├── All generic interfaces (ITensor, IArray, etc.)
- │ │ └── Objects metadata classes (TensorInfo, MultiImageInfo)
- │ ├── graph
- │ │ ├── algorithms --> Generic algorithms used by the graph backend (e.g Order of traversal)
- │ │ ├── backends --> The backend specific code
- │ │ │ ├── CL --> OpenCL specific operations
- │ │ │ ├── GLES --> OpenGLES Compute Shaders specific operations
- │ │ │ └── NEON --> NEON specific operations
- │ │ ├── detail --> Collection of internal utilities.
- │ │ ├── frontend --> Code related to the stream frontend interface.
- │ │ ├── mutators --> Used to modify / optimise the Graph intermediate representation(Operator fusion, in place operations, etc.)
- │ │ ├── nodes --> The various nodes supported by the graph API
- │ │ ├── printers --> Debug printers
- │ │ └── Graph objects interfaces (INode, ITensorAccessor, Graph, etc.)
- │ └── runtime
- │ ├── common
- │ │ └── Common utility code used by all backends
- │ ├── CL
- │ │ ├── CL objects & allocators (CLArray, CLTensor, etc.)
- │ │ ├── functions --> Folder containing all the OpenCL functions
- │ │ │ └── CL*.h
- │ │ ├── CLScheduler.h --> Interface to enqueue OpenCL kernels and get/set the OpenCL CommandQueue and ICLTuner.
- │ │ ├── CLFunctions.h --> Includes all the OpenCL functions at once
- │ │ ├── ICLTuner.h --> Interface used to tune the local work-group size of OpenCL kernels
- │ │ └── tuners
- │ │ └── Local workgroup size tuners for specific architectures / GPUs
- │ ├── CPP
- │ │ ├── CPPKernels.h --> Includes all the CPP functions at once.
- │ │ ├── CPPScheduler.h --> Basic pool of threads to execute CPP/NEON code on several cores in parallel
- │ │ └── functions --> Folder containing all the CPP functions
- │ │ └── CPP*.h
- │ ├── GLES_COMPUTE
- │ │ ├── GLES objects & allocators (GCArray, GCTensor, etc.)
- │ │ ├── functions --> Folder containing all the GLES functions
- │ │ │ └── GC*.h
- │ │ ├── GCScheduler.h --> Interface to enqueue GLES kernels and get/set the GLES CommandQueue.
- │ │ └── GCFunctions.h --> Includes all the GLES functions at once
- │ ├── NEON
- │ │ ├── functions --> Folder containing all the NEON functions
- │ │ │ └── NE*.h
- │ │ └── NEFunctions.h --> Includes all the NEON functions at once
- │ ├── OMP
- │ │ └── OMPScheduler.h --> OpenMP scheduler (Alternative to the CPPScheduler)
- │ ├── Memory & weights manager files (LifetimeManager, PoolManager, etc.)
- │ └── Basic implementations of the generic object interfaces (Array, Tensor, etc.)
- ├── data --> Contains test images and reference data dumps used by validation tests
- ├── docs --> Contains Doxyfile and Doxygen sources used to generate the HTML pages.
- ├── examples
- │ ├── gemm_tuner
- │ │ └── OpenCL GEMM tuner utility
- │ ├── cl_*.cpp --> OpenCL examples
- │ ├── gc_*.cpp --> GLES compute shaders examples
- │ ├── graph_*.cpp --> Graph examples
- │ ├── neoncl_*.cpp --> NEON / OpenCL interoperability examples
- │ └── neon_*.cpp --> NEON examples
- ├── include
- │ ├── CL
- │ │ └── Khronos OpenCL C headers and C++ wrapper
- │ ├── half --> FP16 library available from http://half.sourceforge.net
- │ ├── libnpy --> Library to load / write npy buffers, available from https://github.com/llohse/libnpy
- │ ├── linux --> Headers only needed for Linux builds
- │ │ └── Khronos EGL and OpenGLES headers
- │ └── stb
- │ └── stb_image.h --> Single header library to load image files, available from https://github.com/nothings/stb
- ├── scripts
- │ ├── caffe_data_extractor.py --> Basic script to export weights from Caffe to npy files
- │ └── tensorflow_data_extractor.py --> Basic script to export weights from Tensor Flow to npy files
- ├── src
- │ ├── core
- │ │ └── ... (Same structure as headers)
- │ │ ├── CL
- │ │ │ └── cl_kernels --> All the OpenCL kernels
- │ │ └── GLES_COMPUTE
- │ │ └── cs_shaders --> All the OpenGL ES Compute Shaders
- │ ├── graph
- │ │ └── ... (Same structure as headers)
- │ └── runtime
- │ └── ... (Same structure as headers)
- ├── support
- │ └── Various headers to work around toolchains / platform issues.
- ├── tests
- │ ├── All test related files shared between validation and benchmark
- │ ├── benchmark --> Sources for benchmarking
- │ │ ├── Benchmark specific files
- │ │ ├── fixtures
- │ │ │ └── Backend agnostic fixtures to initialise and run the functions to test.
- │ │ ├── CL --> OpenCL benchmarking tests
- │ │ ├── GLES_COMPUTE --> GLES benchmarking tests
- │ │ └── NEON --> NEON benchmarking tests
- │ ├── benchmark_examples --> Sources needed to wrap examples to run through our benchmarking framework.
- │ ├── CL --> OpenCL accessors
- │ ├── GLES_COMPUTE --> GLES accessors
- │ ├── NEON --> NEON accessors
- │ ├── datasets
- │ │ └── Datasets for all the validation / benchmark tests, layer configurations for various networks, etc.
- │ ├── framework
- │ │ └── Boiler plate code for both validation and benchmark test suites (Command line parsers, instruments, output loggers, etc.)
- │ ├── instruments --> User defined instruments that can be registered to the framework.
- │ ├── validate_examples --> Sources needed to wrap examples to run through our validation framework.
- │ └── validation --> Sources for validation
- │ ├── Validation specific files
- │ ├── fixtures
- │ │ └── Backend agnostic fixtures to initialise and run the functions to test.
- │ ├── reference
- │ │ └── Reference implementation used to validate the results of the various backends.
- │ ├── CL --> OpenCL validation tests
- │ ├── GLES_COMPUTE --> GLES validation tests
- │ ├── CPP --> C++ reference implementations
- │ └── NEON --> NEON validation tests
- └── utils --> Boiler plate code used by examples
- └── Various utilities to print types, load / store assets, etc.
+ For detailed information about the file organization, please refer to the Files -> File List section of this documentation.
@section S2_versions_changelog Release versions and changelog
@@ -237,6 +86,308 @@
@subsection S2_2_changelog Changelog
+v20.11 Public major release
+ - Various bug fixes.
+ - Various optimisations.
+ - Performance regressions may be observed when executing Depthwise Convolution on Neon with a depth multiplier > 1 for quantized data types.
+ This is planned to be resolved in the 21.02 release.
+ - Added new data type QASYMM8_SIGNED support for @ref NEROIAlignLayer.
+ - Added new data type S32 support for:
+ - @ref NEArithmeticSubtraction
+ - @ref NEArithmeticSubtractionKernel
+ - @ref NEPixelWiseMultiplication
+ - @ref NEPixelWiseMultiplicationKernel
+ - @ref NEElementwiseDivision
+ - @ref NEDivisionOperationKernel
+ - Interface change
+ - Softmax axis now has the same meaning as in other major frameworks. That is, axis defines the dimension
+ on which Softmax/Logsoftmax is performed. E.g. for an input of shape 4x5x6 and axis=1, softmax is applied to 4x6=24 vectors of size 5.
+ The supported value range of axis is [-rank, rank). A minimal usage sketch is shown after this changelog entry.
+ This change applies to the following functions:
+ - @ref NESoftmaxLayer
+ - @ref NELogSoftmaxLayer
+ - @ref CLSoftmaxLayer
+ - @ref CLLogSoftmaxLayer
+ - @ref GCSoftmaxLayer
+ - New OpenCL kernels / functions:
+ - @ref CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel
+ - @ref CLLogicalNot
+ - @ref CLLogicalAnd
+ - @ref CLLogicalOr
+ - New NEON kernels / functions:
+ - @ref NELogicalNot
+ - @ref NELogicalAnd
+ - @ref NELogicalOr
+ - Removed padding from NEON kernels:
+ - @ref NEComplexPixelWiseMultiplicationKernel
+ - @ref NENonMaximaSuppression3x3Kernel
+ - @ref NERemapKernel
+ - @ref NEGEMMInterleave4x4Kernel
+ - @ref NEDirectConvolutionLayerKernel
+ - @ref NEScaleKernel
+ - @ref NELocallyConnectedMatrixMultiplyKernel
+ - @ref NEGEMMLowpOffsetContributionKernel
+ - @ref NEGEMMTranspose1xWKernel
+ - @ref NEPoolingLayerKernel
+ - @ref NEConvolutionKernel
+ - @ref NEDepthwiseConvolutionLayerNativeKernel
+ - @ref NEGEMMLowpMatrixMultiplyKernel
+ - @ref NEGEMMMatrixMultiplyKernel
+ - @ref NEDirectConvolutionLayerOutputStageKernel
+ - @ref NEReductionOperationKernel
+ - @ref NEGEMMLowpMatrixAReductionKernel
+ - @ref NEGEMMLowpMatrixBReductionKernel
+ - Removed padding from OpenCL kernels:
+ - @ref CLBatchConcatenateLayerKernel
+ - @ref CLElementwiseOperationKernel
+ - @ref CLBatchNormalizationLayerKernel
+ - @ref CLPoolingLayerKernel
+ - @ref CLWinogradInputTransformKernel
+ - @ref CLGEMMLowpMatrixMultiplyNativeKernel
+ - @ref CLGEMMLowpMatrixAReductionKernel
+ - @ref CLGEMMLowpMatrixBReductionKernel
+ - @ref CLGEMMLowpOffsetContributionOutputStageKernel
+ - @ref CLGEMMLowpOffsetContributionKernel
+ - @ref CLWinogradOutputTransformKernel
+ - @ref CLGEMMLowpMatrixMultiplyReshapedKernel
+ - @ref CLFuseBatchNormalizationKernel
+ - @ref CLDepthwiseConvolutionLayerNativeKernel
+ - @ref CLDepthConvertLayerKernel
+ - @ref CLCopyKernel
+ - @ref CLDepthwiseConvolutionLayer3x3NHWCKernel
+ - @ref CLActivationLayerKernel
+ - @ref CLWinogradFilterTransformKernel
+ - @ref CLWidthConcatenateLayerKernel
+ - @ref CLWidthConcatenate4TensorsKernel
+ - @ref CLWidthConcatenate2TensorsKernel
+ - @ref CLLogits1DMaxShiftExpSumKernel
+ - @ref CLLogits1DNormKernel
+ - @ref CLHeightConcatenateLayerKernel
+ - @ref CLGEMMMatrixMultiplyKernel
+ - @ref CLGEMMLowpQuantizeDownInt32ScaleKernel
+ - @ref CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel
+ - @ref CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel
+ - @ref CLDepthConcatenateLayerKernel
+ - @ref CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel
+ - Removed OpenCL kernels / functions:
+ - CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel
+ - CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel
+ - CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel
+ - Deprecated OpenCL kernels / functions (If a kernel is used only by a function that is being deprecated, the kernel is deprecated with it):
+ - CLLocallyConnectedLayer
+ - CLLocallyConnectedMatrixMultiplyKernel
+ - CLAbsoluteDifference
+ - CLAbsoluteDifferenceKernel
+ - CLAccumulate
+ - CLAccumulateKernel
+ - CLAccumulateSquared
+ - CLAccumulateSquaredKernel
+ - CLAccumulateWeighted
+ - CLAccumulateWeightedKernel
+ - CLAccumulateWeightedFP16Kernel
+ - CLBox3x3
+ - CLBox3x3Kernel
+ - CLBox3x3FP16Kernel
+ - CLCannyEdge
+ - CLChannelCombine
+ - CLChannelCombineKernel
+ - CLChannelExtract
+ - CLChannelExtractKernel
+ - CLColorConvert
+ - CLColorConvertKernel
+ - CLConvolution3x3
+ - CLConvolutionRectangle
+ - CLConvolutionRectangleKernel
+ - CLConvolutionSquare
+ - CLConvolutionKernel
+ - CLDerivative
+ - CLDerivativeKernel
+ - CLDilate
+ - CLDilateKernel
+ - CLEqualizeHistogram
+ - CLErode
+ - CLErodeKernel
+ - CLFastCorners
+ - CLFastCornersKernel
+ - CLGaussian3x3
+ - CLGaussian3x3Kernel
+ - CLGaussian5x5
+ - CLGaussian5x5HorKernel
+ - CLGaussian5x5VertKernel
+ - CLGaussianPyramid
+ - CLGaussianPyramidHalf
+ - CLGaussianPyramidOrb
+ - CLHarrisCorners
+ - CLHarrisScoreKernel
+ - CLHarrisScoreFP16Kernel
+ - CLHistogram
+ - CLHistogramKernel
+ - CLHOGOrientationBinningKernel
+ - CLHOGBlockNormalizationKernel
+ - CLHOGDetectorKernel
+ - CLHOGNonMaximaSuppressionKernel
+ - CLHOGDescriptor
+ - CLHOGDetector
+ - CLHOGGradient
+ - CLHOGMultiDetection
+ - CLHOGOrientationBinningKernel
+ - CLHOGBlockNormalizationKernel
+ - CLHOGDetectorKernel
+ - CLIntegralImage
+ - CLIntegralImageKernel
+ - CLLaplacianReconstruct
+ - CLLaplacianPyramid
+ - CLMagnitude
+ - CLMagnitudePhaseKernel
+ - CLMedian3x3
+ - CLMedian3x3Kernel
+ - CLMinMaxLocation
+ - CLMinMaxLocationKernel
+ - CLNonLinearFilter
+ - CLNonLinearFilterKernel
+ - CLNonMaximaSuppression3x3
+ - CLNonMaximaSuppression3x3FP16Kernel
+ - CLNonMaximaSuppression3x3Kernel
+ - CLOpticalFlow
+ - CLPhase
+ - CLRemap
+ - CLRemapKernel
+ - CLScharr3x3
+ - CLScharr3x3Kernel
+ - CLSobel3x3
+ - CLSobel3x3Kernel
+ - CLSobel5x5
+ - CLSobel5x5HorKernel
+ - CLSobel5x5VertKernel
+ - CLSobel7x7
+ - CLSobel7x7HorKernel
+ - CLSobel7x7VertKernel
+ - CLThreshold
+ - CLThresholdKernel
+ - CLWarpAffine
+ - CLWarpAffineKernel
+ - CLWarpPerspective
+ - CLWarpPerspectiveKernel
+ - Deprecated NEON kernels / functions (If a kernel is used only by a function that is being deprecated, the kernel is deprecated with it):
+ - NELocallyConnectedLayer
+ - NELocallyConnectedMatrixMultiplyKernel
+ - NEAbsoluteDifference
+ - NEAbsoluteDifferenceKernel
+ - NEAccumulate
+ - NEAccumulateKernel
+ - NEAccumulateSquared
+ - NEAccumulateSquaredKernel
+ - NEAccumulateWeighted
+ - NEAccumulateWeightedKernel
+ - NEAccumulateWeightedFP16Kernel
+ - NEBox3x3
+ - NEBox3x3Kernel
+ - NEBox3x3FP16Kernel
+ - NECannyEdge
+ - NEChannelCombine
+ - NEChannelCombineKernel
+ - NEChannelExtract
+ - NEChannelExtractKernel
+ - NEColorConvert
+ - NEColorConvertKernel
+ - NEConvolution3x3
+ - NEConvolutionRectangle
+ - NEConvolutionRectangleKernel
+ - NEConvolutionSquare
+ - NEConvolutionKernel
+ - NEDerivative
+ - NEDerivativeKernel
+ - NEDilate
+ - NEDilateKernel
+ - NEEqualizeHistogram
+ - NEErode
+ - NEErodeKernel
+ - NEFastCorners
+ - NEFastCornersKernel
+ - NEGaussian3x3
+ - NEGaussian3x3Kernel
+ - NEGaussian5x5
+ - NEGaussian5x5HorKernel
+ - NEGaussian5x5VertKernel
+ - NEGaussianPyramid
+ - NEGaussianPyramidHalf
+ - NEGaussianPyramidOrb
+ - NEHarrisCorners
+ - NEHarrisScoreKernel
+ - NEHarrisScoreFP16Kernel
+ - NEHistogram
+ - NEHistogramKernel
+ - NEHOGOrientationBinningKernel
+ - NEHOGBlockNormalizationKernel
+ - NEHOGDetectorKernel
+ - NEHOGNonMaximaSuppressionKernel
+ - NEHOGDescriptor
+ - NEHOGDetector
+ - NEHOGGradient
+ - NEHOGMultiDetection
+ - NEHOGOrientationBinningKernel
+ - NEHOGBlockNormalizationKernel
+ - NEHOGDetectorKernel
+ - NEIntegralImage
+ - NEIntegralImageKernel
+ - NELaplacianReconstruct
+ - NELaplacianPyramid
+ - NEMagnitude
+ - NEMagnitudePhaseKernel
+ - NEMedian3x3
+ - NEMedian3x3Kernel
+ - NEMinMaxLocation
+ - NEMinMaxLocationKernel
+ - NENonLinearFilter
+ - NENonLinearFilterKernel
+ - NENonMaximaSuppression3x3
+ - NENonMaximaSuppression3x3FP16Kernel
+ - NENonMaximaSuppression3x3Kernel
+ - NEOpticalFlow
+ - NEPhase
+ - NERemap
+ - NERemapKernel
+ - NEScharr3x3
+ - NEScharr3x3Kernel
+ - NESobel3x3
+ - NESobel3x3Kernel
+ - NESobel5x5
+ - NESobel5x5HorKernel
+ - NESobel5x5VertKernel
+ - NESobel7x7
+ - NESobel7x7HorKernel
+ - NESobel7x7VertKernel
+ - NEThreshold
+ - NEThresholdKernel
+ - NEWarpAffine
+ - NEWarpAffineKernel
+ - NEWarpPerspective
+ - NEWarpPerspectiveKernel
+ - Deprecated GLES kernels / functions (If a kernel is used only by a function that is being deprecated, the kernel is deprecated with it):
+ - GCAbsoluteDifference
+ - GCActivationLayer
+ - GCArithmeticAddition
+ - GCBatchNormalizationLayer
+ - GCConcatenateLayer
+ - GCConvolutionLayer
+ - GCDepthwiseConvolutionLayer
+ - GCDirectConvolutionLayer
+ - GCDropoutLayer
+ - GCFillBorder
+ - GCFullyConnectedLayer
+ - GCGEMM
+ - GCGEMMInterleave4x4
+ - GCGEMMTranspose1xW
+ - GCNormalizationLayer
+ - GCNormalizePlanarYUVLayer
+ - GCPixelWiseMultiplication
+ - GCPoolingLayer
+ - GCScale
+ - GCSoftmaxLayer
+ - GCTensorShift
+ - GCTranspose
+
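+ A minimal sketch of the new softmax axis semantics with @ref NESoftmaxLayer (the shape and parameter values below are purely illustrative):
+
+ @code{.cpp}
+ // Apply softmax along axis 1 of a 4x5x6 FP32 tensor, i.e. over 4x6=24 vectors of size 5.
+ #include "arm_compute/runtime/NEON/NEFunctions.h"
+ #include "arm_compute/runtime/Tensor.h"
+
+ using namespace arm_compute;
+
+ void softmax_axis_example()
+ {
+     Tensor src, dst;
+     src.allocator()->init(TensorInfo(TensorShape(4U, 5U, 6U), 1, DataType::F32));
+     dst.allocator()->init(TensorInfo(TensorShape(4U, 5U, 6U), 1, DataType::F32));
+
+     NESoftmaxLayer softmax;
+     softmax.configure(&src, &dst, 1.0f /* beta */, 1 /* axis */);
+
+     src.allocator()->allocate();
+     dst.allocator()->allocate();
+     // ... fill src with input data ...
+     softmax.run();
+ }
+ @endcode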
+
v20.08 Public major release
- Various bug fixes.
- Various optimisations.
@@ -284,7 +435,7 @@
- @ref NEDepthConvertLayerKernel
- @ref NERangeKernel
- @ref NEPriorBoxLayer
- - Removedd OpenCL kernels / functions:
+ - Removed OpenCL kernels / functions:
- CLGEMMLowpQuantizeDownInt32ToUint8Scale
- CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloat
- Removed NEON kernels / functions:
@@ -394,7 +545,7 @@
- @ref NEComparisonOperationKernel
- @ref NEConvolutionLayer
- @ref NEDepthwiseConvolutionLayer
- - @ref NEDepthwiseConvolutionLayer3x3Kernel
+ - NEDepthwiseConvolutionLayer3x3Kernel
- @ref NEDirectConvolutionLayerOutputStageKernel
- @ref NEElementwiseComparison
- @ref NEElementwiseMax
@@ -406,13 +557,13 @@
- @ref NEPoolingLayer
- @ref NEPReluLayer
- Added support for QSYMM8_PER_CHANNEL in:
- - @ref NEDepthwiseConvolutionLayer3x3Kernel
+ - NEDepthwiseConvolutionLayer3x3Kernel
- Added support for split sizes in:
- @ref CLSplit
- @ref NESplit
- New OpenCL kernels / functions:
- @ref CLFill
- - @ref CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel / @ref CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint
+ - CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel / @ref CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint
- New NEON kernels / functions:
- @ref NEFill
- @ref NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel / @ref NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint
@@ -546,7 +697,7 @@
- @ref CLBatchConcatenateLayerKernel
- @ref CLDepthToSpaceLayerKernel / @ref CLDepthToSpaceLayer
- @ref CLGEMMLowpMatrixMultiplyNativeKernel
- - @ref CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel
+ - CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel
- @ref CLGEMMMatrixMultiplyNativeKernel
- @ref CLMeanStdDevNormalizationKernel / @ref CLMeanStdDevNormalizationLayer
- @ref CLSpaceToDepthLayerKernel / @ref CLSpaceToDepthLayer
@@ -774,11 +925,11 @@
- @ref CLL2NormalizeLayer
- Added QASYMM8 support to the following kernels:
- @ref CLScaleKernel
- - @ref NEDepthwiseConvolutionLayer3x3Kernel
+ - NEDepthwiseConvolutionLayer3x3Kernel
- @ref CLPixelWiseMultiplicationKernel
- Added FP16 support to the following kernels:
- @ref CLDepthwiseConvolutionLayer3x3NHWCKernel
- - @ref NEDepthwiseConvolutionLayer3x3Kernel
+ - NEDepthwiseConvolutionLayer3x3Kernel
- @ref CLNormalizePlanarYUVLayerKernel
- @ref CLWinogradConvolutionLayer (5x5 kernel)
- More tests added to both validation and benchmarking suites.
@@ -929,7 +1080,7 @@
- Refactored NEON Winograd (NEWinogradLayerKernel)
- Added @ref NEDirectConvolutionLayerOutputStageKernel
- Added QASYMM8 support to the following NEON kernels:
- - @ref NEDepthwiseConvolutionLayer3x3Kernel
+ - NEDepthwiseConvolutionLayer3x3Kernel
- @ref NEFillBorderKernel
- @ref NEPoolingLayerKernel
- Added new examples:
@@ -973,14 +1124,14 @@
- New NEON kernels / functions
- arm_compute::NEGEMMLowpAArch64A53Kernel / arm_compute::NEGEMMLowpAArch64Kernel / arm_compute::NEGEMMLowpAArch64V8P4Kernel / arm_compute::NEGEMMInterleavedBlockedKernel / arm_compute::NEGEMMLowpAssemblyMatrixMultiplyCore
- arm_compute::NEHGEMMAArch64FP16Kernel
- - @ref NEDepthwiseConvolutionLayer3x3Kernel / NEDepthwiseIm2ColKernel / NEGEMMMatrixVectorMultiplyKernel / NEDepthwiseVectorToTensorKernel / @ref NEDepthwiseConvolutionLayer
+ - NEDepthwiseConvolutionLayer3x3Kernel / NEDepthwiseIm2ColKernel / NEGEMMMatrixVectorMultiplyKernel / NEDepthwiseVectorToTensorKernel / @ref NEDepthwiseConvolutionLayer
- @ref NEGEMMLowpOffsetContributionKernel / @ref NEGEMMLowpMatrixAReductionKernel / @ref NEGEMMLowpMatrixBReductionKernel / @ref NEGEMMLowpMatrixMultiplyCore
- @ref NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel / @ref NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint
- NEWinogradLayer / NEWinogradLayerKernel
- New OpenCL kernels / functions
- @ref CLGEMMLowpOffsetContributionKernel / @ref CLGEMMLowpMatrixAReductionKernel / @ref CLGEMMLowpMatrixBReductionKernel / @ref CLGEMMLowpMatrixMultiplyCore
- - @ref CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel / @ref CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint
+ - CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel / @ref CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint
- New graph nodes for NEON and OpenCL
- graph::BranchLayer
@@ -1328,7 +1479,7 @@
The examples get automatically built by scons as part of the build process of the library described above. This section just describes how you can build and link your own application against our library.
-@note The following command lines assume the arm_compute binaries are present in the current directory or in the system library path. If this is not the case you can specify the location of the pre-built library with the compiler option -L. When building the OpenCL example the commands below assume that the CL headers are located in the include folder where the command is executed.
+@note The following command lines assume the arm_compute libraries are present in the current directory or in the system library path. If this is not the case you can specify the location of the pre-built libraries with the compiler option -L. When building the OpenCL example the commands below assume that the CL headers are located in the include folder where the command is executed.
To cross compile a NEON example for Linux 32bit:
@@ -1433,9 +1584,9 @@
Here is a guide to <a href="https://developer.android.com/ndk/guides/standalone_toolchain.html">create your Android standalone toolchains from the NDK</a>
-- Download the NDK r18b from here: https://developer.android.com/ndk/downloads/index.html
+- Download the NDK r18b from https://developer.android.com/ndk/downloads/index.html into the directory $NDK
- Make sure you have Python 2.7 installed on your machine.
-- Generate the 32 and/or 64 toolchains by running the following commands:
+- Generate the 32 and/or 64 toolchains into your toolchain directory $MY_TOOLCHAINS by running the following commands:
$NDK/build/tools/make_standalone_toolchain.py --arch arm64 --install-dir $MY_TOOLCHAINS/aarch64-linux-android-ndk-r18b --stl libc++ --api 21
@@ -1465,7 +1616,7 @@
The examples get automatically built by scons as part of the build process of the library described above. This section just describes how you can build and link your own application against our library.
-@note The following command lines assume the arm_compute binaries are present in the current directory or in the system library path. If this is not the case you can specify the location of the pre-built library with the compiler option -L. When building the OpenCL example the commands below assume that the CL headers are located in the include folder where the command is executed.
+@note The following command lines assume the arm_compute libraries are present in the current directory or in the system library path. If this is not the case you can specify the location of the pre-built libraries with the compiler option -L. When building the OpenCL example the commands below assume that the CL headers are located in the include folder where the command is executed.
Once you've got your Android standalone toolchain built and added to your path you can do the following:
@@ -1649,7 +1800,7 @@
@subsubsection S3_7_1_cl_tuner_how_to How to use it
-All the graph examples in the ACL's folder "examples" and the arm_compute_benchmark accept an argument to enable the OpenCL tuner and an argument to export/import the LWS values to/from a file
+All the graph examples in the Compute Library's folder "examples" and the arm_compute_benchmark accept an argument to enable the OpenCL tuner and an argument to export/import the LWS values to/from a file
#Enable CL tuner
./graph_mobilenet --enable-tuner --target=CL
diff --git a/docs/01_library.dox b/docs/01_library.dox
index ea29b75..742a246 100644
--- a/docs/01_library.dox
+++ b/docs/01_library.dox
@@ -43,7 +43,34 @@
For maximum performance, it is expected that the users would re-implement an equivalent to the runtime library which suits better their needs (With a more clever multi-threading strategy, load-balancing between NEON and OpenCL, etc.)
-@section S4_1_2 Thread-safety
+@section S4_1_2 Data-type and Data-layout support
+
+Compute Library supports a wide range of data types; detailed information can be found directly in the documentation of each kernel/function.
+The main data types that the Machine Learning functions support are the following:
+- BFLOAT16: 16-bit non-standard brain floating point
+- F16: 16-bit half precision floating point
+- F32: 32-bit single precision floating point
+- QASYMM8: 8-bit unsigned asymmetric quantized
+- QASYMM8_SIGNED: 8-bit signed asymmetric quantized
+- QSYMM8_PER_CHANNEL: 8-bit signed symmetric quantized (Used for the weights)
+
+Moreover, Compute Library supports the following data layouts (fast changing dimension from right to left):
+- NHWC: The native layout of Compute Library that delivers the best performance where channels are in the fastest changing dimension
+- NCHW: Legacy layout where width is in the fastest changing dimension
+where N = batches, C = channels, H = height, W = width
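+
+As a minimal sketch (the shape, scale and offset values below are purely illustrative), a tensor descriptor combining one of the data types above with the NHWC layout can be set up as follows:
+
+@code{.cpp}
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+
+using namespace arm_compute;
+
+TensorInfo make_nhwc_qasymm8_info()
+{
+    // Describe a QASYMM8 tensor with 1 batch, 224x224 spatial size and 3 channels in NHWC.
+    // TensorShape lists dimensions fastest-changing first, i.e. C, W, H, N for NHWC.
+    TensorInfo info(TensorShape(3U, 224U, 224U, 1U), 1, DataType::QASYMM8);
+    info.set_data_layout(DataLayout::NHWC);
+    info.set_quantization_info(QuantizationInfo(1.f / 255.f, 0));
+    return info;
+}
+@endcode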
+
+@section S4_1_3 Fast-math support
+
+Compute Library supports different convolution methods; the fast-math flag is only used for the Winograd algorithm.
+When the fast-math flag is enabled, both NEON and CL convolution layers will try to dispatch the fastest implementation available, which may introduce a drop in accuracy. The different scenarios involving the fast-math flag are presented below:
+- For FP32:
+ - no-fast-math: Only supports Winograd 3x3,3x1,1x3,5x1,1x5,7x1,1x7
+ - fast-math: Supports Winograd 3x3,3x1,1x3,5x1,1x5,7x1,1x7,5x5,7x7
+- For FP16:
+ - no-fast-math: No Winograd support
+ - fast-math: Supports Winograd 3x3,3x1,1x3,5x1,1x5,7x1,1x7,5x5,7x7
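+
+As an illustrative sketch (the tensor pointers and convolution parameters below are placeholders), the fast-math flag is typically passed when configuring a convolution layer:
+
+@code{.cpp}
+#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"
+
+using namespace arm_compute;
+
+void configure_conv_with_fast_math(ITensor *src, ITensor *weights, ITensor *biases, ITensor *dst)
+{
+    NEConvolutionLayer conv;
+    // The last boolean enables fast-math, allowing a Winograd implementation to be
+    // selected for kernel sizes such as 5x5 / 7x7, possibly at reduced accuracy.
+    conv.configure(src, weights, biases, dst,
+                   PadStrideInfo(1, 1, 1, 1), // stride and padding (illustrative)
+                   WeightsInfo(),
+                   Size2D(1U, 1U),            // dilation
+                   ActivationLayerInfo(),
+                   true /* enable_fast_math */);
+}
+@endcode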
+
+@section S4_1_4 Thread-safety
Although the library supports multi-threading during workload dispatch, thus parallelizing the execution of the workload across multiple threads, the current runtime module implementation is not thread-safe in the sense of executing different functions from separate threads.
This is due to the fact that the provided scheduling mechanism wasn't designed with thread-safety in mind.
diff --git a/docs/02_tests.dox b/docs/02_tests.dox
index a813844..c46e1f5 100644
--- a/docs/02_tests.dox
+++ b/docs/02_tests.dox
@@ -45,28 +45,6 @@
@note Tests are not included in the pre-built binary archive, you have to build them from sources.
-@subsection tests_overview_structure Directory structure
-
- .
- `-- tests <- Top level test directory. All files in here are shared among validation and benchmark.
- |-- framework <- Underlying test framework.
- |-- CL \
- |-- GLES_COMPUTE \
- |-- NEON -> Backend specific files with helper functions etc.
- |-- benchmark <- Top level directory for the benchmarking files.
- | |-- fixtures <- Fixtures for benchmark tests.
- | |-- CL <- OpenCL backend test cases on a function level.
- | |-- GLES_COMPUTE <- Same of OpenGL ES
- | `-- NEON <- Same for NEON
- |-- datasets <- Datasets for benchmark and validation tests.
- |-- main.cpp <- Main entry point for the tests. Currently shared between validation and benchmarking.
- `-- validation -> Top level directory for validation files.
- |-- CPP -> C++ reference code
- |-- CL \
- |-- GLES_COMPUTE \
- |-- NEON -> Backend specific test cases
- `-- fixtures -> Fixtures shared among all backends. Used to setup target function and tensors.
-
@subsection tests_overview_fixtures Fixtures
Fixtures can be used to share common setup, teardown or even run tasks among
diff --git a/docs/03_scripts.dox b/docs/03_scripts.dox
index efa6fa9..7e16edf 100644
--- a/docs/03_scripts.dox
+++ b/docs/03_scripts.dox
@@ -143,6 +143,11 @@
the weights and biases into tensor from the .npy file by the help of Accessor.
@section validate_examples Validating examples
+
+Compute Library provides a list of graph examples that are used in the context of integration and performance testing.
+The provenance of each model is part of its documentation and no structural or data alterations have been applied to any
+of them unless explicitly specified otherwise in the documentation.
+
Using one of the provided scripts will generate files containing the trainable parameters.
You can validate a given graph example on a list of inputs by running:
diff --git a/docs/04_adding_operator.dox b/docs/04_adding_operator.dox
index c40aaa3..13be712 100644
--- a/docs/04_adding_operator.dox
+++ b/docs/04_adding_operator.dox
@@ -30,7 +30,7 @@
@tableofcontents
@section S4_1_introduction Introduction
-In ACL there are two main parts or modules:
+In Compute Library there are two main parts or modules:
- The core library consists of a low-level collection of algorithms implemented in C++ and optimized for Arm CPUs and GPUs. The core module is designed to be embedded in other projects and it doesn't perform any memory management or scheduling.
- The runtime library is a wrapper of the core library and provides other additional features like memory management, multithreaded execution of workloads and allocation of the intermediate tensors.
@@ -41,7 +41,7 @@
@section S4_1_supporting_new_operators Supporting new operators
-Following are the steps involved in adding support for a new operator in ACL
+Following are the steps involved in adding support for a new operator in Compute Library
- Add new data types (if required)
- Add the kernel to the core library.
- Add the function to the runtime library.
@@ -52,7 +52,7 @@
@subsection S4_1_1_add_datatypes Adding new data types
-The ACL declares a few new datatypes related to ACL's domain, kernels, and functions in the library process Tensors and Images (Computer Vision functions). Tensors are multi-dimensional arrays with a maximum of Coordinates::num_max_dimensions dimensions; depending on the number of dimensions tensors can be interpreted as various objects. A scalar can be represented as a zero-dimensional tensor and a vector of numbers can be represented as a one-dimensional tensor. Furthermore, an image is just a 2D tensor, a 3D tensor can be seen as an array of images and a 4D tensor as a 2D array of images, etc.
+Compute Library declares a few new data types related to its domain; kernels and functions in the library process Tensors and Images (Computer Vision functions). Tensors are multi-dimensional arrays with a maximum of Coordinates::num_max_dimensions dimensions; depending on the number of dimensions tensors can be interpreted as various objects. A scalar can be represented as a zero-dimensional tensor and a vector of numbers can be represented as a one-dimensional tensor. Furthermore, an image is just a 2D tensor, a 3D tensor can be seen as an array of images and a 4D tensor as a 2D array of images, etc.
All the datatype classes or structures are grouped in the core library folder arm_compute/core like the @ref ITensor, @ref ITensorInfo (all the information of a tensor), TensorShape and simpler types are in arm_compute/core/Types.h.
If an operator handles a new datatype, it must be added to the library. While adding a new data type to the library, it's necessary to implement the function to enable printing, the to_string() method and the output stream insertion (<<) operator. Every datatype implements these two functions in utils/TypePrinter.h
@@ -65,13 +65,13 @@
@snippet utils/TypePrinter.h Print DataLayout type
-In the ACL library, we use namespaces to group all the operators, functions, classes and interfaces. The main namespace to use is arm_compute. In the test suite, the test framework and the individual tests use nested namespaces like @ref test::validation or @ref test::benchmark to group the different purposes of various parts of the suite.
+In Compute Library, we use namespaces to group all the operators, functions, classes and interfaces. The main namespace to use is arm_compute. In the test suite, the test framework and the individual tests use nested namespaces like @ref test::validation or @ref test::benchmark to group the different purposes of various parts of the suite.
Utility functions like conversion or type cast operators, that are shared by multiple operators are in arm_compute/core/Utils.h. Non-inlined function definitions go in the corresponding .cpp files in the src folder.
Similarly, all common functions that process shapes, like calculating output shapes of an operator or shape conversions etc are in arm_compute/core/utils/misc/ShapeCalculator.h.
@subsection S4_1_2_add_kernel Add a kernel
-As we mentioned at the beginning, the kernel is the implementation of the operator or algorithm partially using a specific programming language related to the backend we want to use. Adding a kernel in the library means implementing the algorithm in a SIMD technology like NEON or OpenCL. All kernels in ACL must implement a common interface IKernel or one of the specific subinterfaces.
+As we mentioned at the beginning, the kernel is the implementation of the operator or algorithm partially using a specific programming language related to the backend we want to use. Adding a kernel in the library means implementing the algorithm in a SIMD technology like NEON or OpenCL. All kernels in Compute Library must implement a common interface IKernel or one of the specific subinterfaces.
IKernel is the common interface for all the kernels in the core library; it contains the main methods to configure and run the kernel itself, such as window(), which returns the maximum window the kernel can be executed on, or is_parallelisable(), which indicates whether or not the kernel is parallelizable. If the kernel is parallelizable then the window returned by the window() method can be split into sub-windows which can then be run in parallel; otherwise, only the window returned by window() can be passed to the run method.
There are specific interfaces for OpenCL and Neon: @ref ICLKernel, INEKernel (using INEKernel = @ref ICPPKernel).
@@ -80,7 +80,7 @@
There are two other implementations of @ref IKernel called @ref ICLSimpleKernel and INESimpleKernel; they are the interfaces for simple kernels that have just one input tensor and one output tensor.
Creating a new kernel implies adding new files:
-- arm_compute/core/CL/kernels/CLReshapeLayerKernel.h
+- src/core/CL/kernels/CLReshapeLayerKernel.h
- src/core/CL/cl_kernels/reshape_layer.cl
- src/core/CL/kernels/CLReshapeLayerKernel.cpp
- src/core/CL/CLKernelLibrary.cpp
@@ -90,16 +90,16 @@
- src/core/NEON/kernels/NEReshapeLayerKernel.cpp
We must register the new layer in the respective libraries:
-- arm_compute/core/CL/CLKernels.h
+- src/core/CL/CLKernels.h
- arm_compute/core/NEON/NEKernels.h
-These files contain the list of all kernels available in the corresponding ACL's backend, for example CLKernels:
+These files contain the list of all the kernels available in the corresponding Compute Library backend, for example CLKernels:
@code{.cpp}
...
-#include "arm_compute/core/CL/kernels/CLMinMaxLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLMinMaxLocationKernel.h"
+#include "src/core/CL/kernels/CLMinMaxLayerKernel.h"
+#include "src/core/CL/kernels/CLMinMaxLocationKernel.h"
...
-#include "arm_compute/core/CL/kernels/CLReshapeLayerKernel.h"
+#include "src/core/CL/kernels/CLReshapeLayerKernel.h"
...
@endcode
@@ -138,7 +138,7 @@
- (sub[n].end() - sub[n].start()) % max[n].step() == 0
@ref CPPScheduler::schedule provides a sample implementation that is used for NEON kernels.
-%Memory management is the other aspect that the runtime layer is supposed to handle. %Memory management of the tensors is abstracted using TensorAllocator. Each tensor holds a pointer to a TensorAllocator object, which is used to allocate and free the memory at runtime. The implementation that is currently supported in ACL allows memory blocks, required to be fulfilled for a given operator, to be grouped together under a @ref MemoryGroup. Each group can be acquired and released. The underlying implementation of memory groups vary depending on whether NEON or CL is used. The memory group class uses memory pool to provide the required memory. It also uses the memory manager to manage the lifetime and a IPoolManager to manage the memory pools registered with the memory manager.
+%Memory management is the other aspect that the runtime layer is supposed to handle. %Memory management of the tensors is abstracted using TensorAllocator. Each tensor holds a pointer to a TensorAllocator object, which is used to allocate and free the memory at runtime. The implementation that is currently supported in Compute Library allows memory blocks, required to be fulfilled for a given operator, to be grouped together under a @ref MemoryGroup. Each group can be acquired and released. The underlying implementation of memory groups varies depending on whether NEON or CL is used. The memory group class uses a memory pool to provide the required memory. It also uses the memory manager to manage the lifetime and an IPoolManager to manage the memory pools registered with the memory manager.
We have seen the various interfaces for a kernel in the core library; the same file structure design exists in the runtime module. IFunction is the base class for all the functions; it has two child interfaces, ICLSimpleFunction and INESimpleFunction, that are used as base classes for functions which call a single kernel.
@@ -268,7 +268,7 @@
where we will put respectively the declaration and definition of the new operator.
All the utility functions that are used ONLY in the tests are in test/validation/helpers.h, for all the others, as mentioned before, there are helpers in the library.
-ACL and the tests do use templates, the reference implementation is a generic implementation independent from the datatype and we use the templates to generalize the datatype concept.
+Compute Library and the tests make use of templates: the reference implementation is a generic implementation independent of the data type, and we use templates to generalize the data type concept.
Following the example, let's have a look at the ReshapeLayer operator:
- tests/validation/reference/ReshapeLayer.h
diff --git a/docs/05_contribution_guidelines.dox b/docs/05_contribution_guidelines.dox
index abe0bc9..1cdd129 100644
--- a/docs/05_contribution_guidelines.dox
+++ b/docs/05_contribution_guidelines.dox
@@ -358,6 +358,52 @@
- **Sanitize data sent to other systems**. Sanitize all data passed to complex subsystems such as command shells, relational databases, and commercial off-the-shelf (COTS) components. Attackers may be able to invoke unused functionality in these components through the use of various injection attacks. This is not necessarily an input validation problem because the complex subsystem being invoked does not understand the context in which the call is made. Because the calling process understands the context, it is responsible for sanitizing the data before invoking the subsystem.
- **Practice defense in depth**. Manage risk with multiple defensive strategies, so that if one layer of defense turns out to be inadequate, another layer of defense can prevent a security flaw from becoming an exploitable vulnerability and/or limit the consequences of a successful exploit. For example, combining secure programming techniques with secure runtime environments should reduce the likelihood that vulnerabilities remaining in the code at deployment time can be exploited in the operational environment.
+@subsection S5_1_5_guidelines_for_stable_api_abi Guidelines for stable API/ABI
+
+The Application Programming Interface (API) and Application Binary Interface (ABI) are the interfaces exposed
+to users so their programs can interact with the library efficiently and effectively. Even though changing the API/ABI
+in a way that breaks backward compatibility is not necessarily bad if it improves the library and other users' experience,
+contributions should be made with an awareness of API/ABI stability. If you'd like to make changes that affect
+the library's API/ABI, please review and follow the guidelines in this section. Also, please note that
+these guidelines are not an exhaustive list but discuss things that might be easily overlooked.
+
+@subsubsection S5_1_5_1_guidelines_for_api Guidelines for API
+
+- When adding new arguments, consider grouping arguments (including the old ones) into a struct rather than adding arguments with default values.
+Introducing a new struct might break the API/ABI once, but it helps keep the interface stable afterwards (see the sketch after this list).
+- When new member variables are added, please make sure they are initialized.
+- Avoid adding enum elements in the middle.
+- When removing arguments, follow the deprecation process described in the following section.
+- When changing behavior affecting API contracts, follow the deprecation process described in the following section.
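+
+As a purely hypothetical sketch of the argument-grouping guideline above (the names below are not part of the library):
+
+@code{.cpp}
+class ITensor; // forward declaration for the sake of the example
+
+// Instead of growing an argument list with default values over time, group the
+// parameters into a descriptor struct so that new fields can be added without
+// touching every call site, and make sure every member is initialized:
+struct SoftmaxDescriptor
+{
+    float beta{ 1.0f };
+    int   axis{ 0 };
+};
+
+void configure(ITensor *src, ITensor *dst, const SoftmaxDescriptor &desc);
+@endcode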
+
+@subsubsection S5_1_5_2_guidelines_for_abi Guidelines for ABI
+
+We recommend reading through <a href="https://community.kde.org/Policies/Binary_Compatibility_Issues_With_C%2B%2B">this page</a>
+and double checking your contributions to see if they include any of the changes listed there.
+
+Also, for classes that require strong ABI stability, consider using the <a href="https://en.cppreference.com/w/cpp/language/pimpl">pImpl idiom</a>.
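+
+A minimal, hypothetical sketch of the idiom:
+
+@code{.cpp}
+// The public class keeps a single opaque pointer, so private members can change
+// in the .cpp file without breaking the ABI.
+#include <memory>
+
+class Operator
+{
+public:
+    Operator();
+    ~Operator(); // defined in the .cpp file, where Impl is a complete type
+    void run();
+
+private:
+    struct Impl;                 // defined only in the .cpp file
+    std::unique_ptr<Impl> _impl;
+};
+@endcode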
+
+@subsubsection S5_1_5_3_api_deprecation_process API deprecation process
+
+In order to deprecate an existing API, these rules should be followed.
+
+- Removal of a deprecated API should wait for at least one official release.
+- Deprecation of runtime APIs should strictly follow the aforementioned period, whereas core APIs can have more flexibility as they are mostly used internally rather than user-facing.
+- Any API changes (update, addition and deprecation) in all components should be well documented by the contribution itself.
+
+Also, it is recommended to use the following utility macros, which are designed to work with both clang and gcc using C++11 and later.
+
+- ARM_COMPUTE_DEPRECATED: Just deprecate the wrapped function
+- ARM_COMPUTE_DEPRECATED_REL: Deprecate the wrapped function and also capture the release in which it was deprecated
+- ARM_COMPUTE_DEPRECATED_REL_REPLACE: Deprecate the wrapped function, capture the release in which it was deprecated and suggest a possible replacement candidate
+
+@code{.cpp}
+ARM_COMPUTE_DEPRECATED_REL_REPLACE(20.08, DoNewThing)
+void DoOldThing();
+
+void DoNewThing();
+@endcode
+
@section S5_2_how_to_submit_a_patch How to submit a patch
To be able to submit a patch to our development repository you need to have a GitHub account. With that, you will be able to sign in to Gerrit where your patch will be reviewed.
diff --git a/docs/06_functions_list.dox b/docs/06_functions_list.dox
index ac94461..c8006c6 100644
--- a/docs/06_functions_list.dox
+++ b/docs/06_functions_list.dox
@@ -54,6 +54,9 @@
- @ref NEExpLayer
- @ref NEGaussian3x3
- @ref NEIntegralImage
+ - @ref NELogicalAnd
+ - @ref NELogicalNot
+ - @ref NELogicalOr
- @ref NEMedian3x3
- @ref NENonLinearFilter
- @ref NENonMaximaSuppression3x3
@@ -141,8 +144,8 @@
- @ref NEGaussianPyramidOrb
- @ref NEGEMM
- @ref NEGEMMAssemblyDispatch
+ - @ref NEGEMMConv2d
- @ref NEGEMMConvolutionLayer
- - @ref NEGEMMLowpAssemblyMatrixMultiplyCore
- @ref NEGEMMLowpMatrixMultiplyCore
- @ref NEGenerateProposalsLayer
- @ref NEHarrisCorners
@@ -173,7 +176,6 @@
- @ref NERNNLayer
- @ref NEROIPoolingLayer
- @ref NEScale
- - @ref NESimpleAssemblyFunction
- @ref NESobel5x5
- @ref NESobel7x7
- @ref NESoftmaxLayerGeneric <IS_LOG>
@@ -231,6 +233,9 @@
- @ref CLLaplacianPyramid
- @ref CLLaplacianReconstruct
- @ref CLLocallyConnectedLayer
+ - @ref CLLogicalAnd
+ - @ref CLLogicalNot
+ - @ref CLLogicalOr
- @ref CLLSTMLayer
- @ref CLLSTMLayerQuantized
- @ref CLQLSTMLayer
@@ -300,6 +305,7 @@
- @ref CLGaussian3x3
- @ref CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint
- @ref CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint
+ - @ref CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel
- @ref CLMagnitude
- @ref CLMeanStdDevNormalizationLayer
- @ref CLMedian3x3
diff --git a/docs/07_errata.dox b/docs/07_errata.dox
index 2d35e67..994b8c5 100644
--- a/docs/07_errata.dox
+++ b/docs/07_errata.dox
@@ -1,5 +1,5 @@
///
-/// Copyright (c) 2019 Arm Limited.
+/// Copyright (c) 2019-2020 Arm Limited.
///
/// SPDX-License-Identifier: MIT
///
@@ -30,6 +30,11 @@
@section S7_1_errata Errata
+- Under certain conditions, the validation test case 'CL/DirectConvolutionLayer/Float/FP32/RunSmall9x9\@InputShape=32x37x3x4:StrideX=1:StrideY=1:PadX=0:PadY=0:KernelSize=9:NumKernels=1:DataType=F32:ActivationInfo=LU_BOUNDED_RELU:DataLayout=NHWC' may fail.
+ - Versions Affected: >= v20.08
+ - Conditions:
+ - The validation suite has to run in nightly mode and execute 40k+ test cases before the test mentioned above
+
- Under certain conditions, benchmark examples can hang when OpenCL profiling queues are enabled.
- Versions Affected: >= v19.11
- OSs Affected: Linux
diff --git a/docs/ComputeLibrary.dir b/docs/ComputeLibrary.dir
new file mode 100644
index 0000000..7733e53
--- /dev/null
+++ b/docs/ComputeLibrary.dir
@@ -0,0 +1,360 @@
+//
+// Copyright © 2020 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+/** @file Android.bp
+ * @brief Generation script for building AndroidNN driver.
+ */
+
+/** @dir arm_compute
+ * @brief All the arm_compute headers.
+ */
+
+/** @dir arm_compute/core
+ * @brief Core module: common basic types and kernels.
+ */
+
+/** @dir arm_compute/core/CL
+ * @brief OpenCL backend core: kernels and utilities.
+ */
+
+/** @file arm_compute/core/CL/CLKernelLibrary.h
+ * @brief Manages all the OpenCL kernels compilation and caching, provides accessors for the OpenCL Context.
+ */
+
+/** @file arm_compute/core/CL/OpenCL.h
+ * @brief Wrapper to configure the Khronos OpenCL C++ header
+ */
+
+/** @dir arm_compute/core/CPP
+ * @brief CPP backend core: kernels and utilities.
+ */
+
+/** @file arm_compute/core/CPP/CPPKernels.h
+ * @brief Includes all the CPP kernels at once
+ */
+
+/** @dir arm_compute/core/CPP/kernels
+ * @brief Folder containing all the CPP kernels
+ */
+
+/** @dir arm_compute/core/experimental
+ * @brief All experimental interfaces
+ */
+
+/** @dir arm_compute/core/GLES_COMPUTE
+ * @brief OpenGLES backend core: kernels and utilities.
+ */
+
+/** @file arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h
+ * @brief Manages all the GLES kernels compilation and caching, provides accessors for the GLES Context.
+ */
+
+/** @file arm_compute/core/GLES_COMPUTE/GCKernels.h
+ * @brief Includes all the GLES kernels at once
+ */
+
+/** @file arm_compute/core/GLES_COMPUTE/OpenGLES.h
+ * @brief Wrapper to configure the Khronos EGL and OpenGL ES C header
+ */
+
+/** @dir arm_compute/core/GLES_COMPUTE/kernels
+ * @brief Folder containing all the GLES kernels
+ */
+
+/** @dir src/core/NEON
+ * @brief NEON backend core: kernels and utilities.
+ */
+
+/** @file src/core/NEON/NEKernels.h
+ * @brief Includes all the NEON kernels at once
+ */
+
+/** @dir src/core/NEON/kernels
+ * @brief Folder containing all the NEON kernels
+ */
+
+/** @dir arm_compute/core/utils
+ * @brief Common core utilities.
+ */
+
+/** @dir arm_compute/graph
+ * @brief Graph API.
+ */
+
+/** @dir arm_compute/graph/algorithms
+ * @brief Generic algorithms used by the graph backend (e.g. order of traversal)
+ */
+
+/** @dir arm_compute/graph/backends
+ * @brief The backend specific code
+ */
+
+/** @dir arm_compute/graph/backends/CL
+ * @brief OpenCL specific operations
+ */
+
+/** @dir arm_compute/graph/backends/GLES
+ * @brief OpenGLES specific operations
+ */
+
+/** @dir arm_compute/graph/backends/NEON
+ * @brief NEON specific operations
+ */
+
+/** @dir arm_compute/graph/detail
+ * @brief Collection of internal utilities.
+ */
+
+/** @dir arm_compute/graph/frontend
+ * @brief Code related to the stream frontend interface.
+ */
+
+/** @dir arm_compute/graph/mutators
+ * @brief Used to modify / optimise the Graph intermediate representation (Operator fusion, in place operations, etc.)
+ */
+
+/** @dir arm_compute/graph/nodes
+ * @brief The various nodes supported by the graph API.
+ */
+
+/** @dir arm_compute/graph/printers
+ * @brief Debug printers.
+ */
+
+/** @file arm_compute/graph.h
+ * @brief Includes all the Graph headers at once.
+ */
+
+/** @dir arm_compute/runtime
+ * @brief Runtime interface: memory, scheduler, functions.
+ */
+
+/** @dir arm_compute/runtime/CL
+ * @brief OpenCL backend runtime interface.
+ */
+
+/** @file arm_compute/runtime/CL/CLFunctions.h
+ * @brief Includes all the OpenCL functions at once
+ */
+
+/** @file arm_compute/runtime/CL/CLScheduler.h
+ * @brief Interface to enqueue OpenCL kernels and get/set the OpenCL CommandQueue and ICLTuner.
+ */
+
+/** @file arm_compute/runtime/CL/ICLTuner.h
+ * @brief Interface used to tune the local work-group size of OpenCL kernels.
+ */
+
+/** @dir arm_compute/runtime/CL/functions
+ * @brief Folder containing all the OpenCL functions.
+ */
+
+/** @dir arm_compute/runtime/CL/tuners
+ * @brief Local workgroup size tuners for specific architectures / GPUs.
+ */
+
+/** @dir arm_compute/runtime/CPP
+ * @brief CPP backend runtime interface.
+ */
+
+/** @file arm_compute/runtime/CPP/CPPScheduler.h
+ * @brief Basic pool of threads to execute CPP/NEON code on several cores in parallel.
+ */
+
+/** @dir arm_compute/runtime/CPP/functions
+ * @brief Folder containing all the CPP functions.
+ */
+
+/** @dir arm_compute/runtime/experimental
+ * @brief Experimental runtime interface.
+ */
+
+/** @dir arm_compute/runtime/GLES_COMPUTE
+ * @brief OpenGLES backend runtime interface.
+ */
+
+/** @file arm_compute/runtime/GLES_COMPUTE/GCFunctions.h
+ * @brief Includes all the OpenGLES functions at once
+ */
+
+/** @file arm_compute/runtime/GLES_COMPUTE/GCScheduler.h
+ * @brief Interface to enqueue GLES kernels and get/set the GLES CommandQueue.
+ */
+
+/** @dir arm_compute/runtime/GLES_COMPUTE/functions
+ * @brief Folder containing all the GLES functions.
+ */
+
+/** @dir arm_compute/runtime/NEON
+ * @brief NEON backend runtime interface.
+ */
+
+/** @file arm_compute/runtime/NEON/NEFunctions.h
+ * @brief Includes all the NEON functions at once.
+ */
+
+/** @dir arm_compute/runtime/NEON/functions
+ * @brief Folder containing all the NEON functions.
+ */
+
+/** @dir arm_compute/runtime/OMP
+ * @brief OpenMP backend runtime interface.
+ */
+
+/** @file arm_compute/runtime/OMP/OMPScheduler.h
+ * @brief OpenMP scheduler (Alternative to the CPPScheduler).
+ */
+
+/** @dir arm_compute/runtime/common
+ * @brief Common utility code used by all backends.
+ */
+
+/** @dir docs
+ * @brief Doxyfile and Doxygen sources used to generate this documentation.
+ */
+
+/** @dir ./examples
+ * @brief Set of examples using the Compute Library
+ *
+ * @details Examples have the following structure:
+ *
+ * -# cl_*.cpp --> OpenCL examples
+ * -# gc_*.cpp --> GLES compute shaders examples
+ * -# graph_*.cpp --> Graph examples
+ * -# neoncl_*.cpp --> NEON / OpenCL interoperability examples
+ * -# neon_*.cpp --> NEON examples
+ */
+
+/** @dir examples/gemm_tuner
+ * @brief OpenCL GEMM tuner utility.
+ */
+
+/** @dir scripts
+ * @brief Utility scripts.
+ */
+
+/** @file scripts/caffe_data_extractor.py
+ * @brief Basic script to export weights from Caffe to npy files.
+ */
+
+/** @file scripts/tensorflow_data_extractor.py
+ * @brief Basic script to export weights from TensorFlow to npy files.
+ */
+
+/** @dir src
+ * @brief Source code implementing all the arm_compute headers.
+ */
+
+/** @dir src/core/NEON/kernels/detail
+ * @brief Common code for several intrinsics implementations.
+ */
+
+/** @dir src/core/NEON/wrapper
+ * @brief NEON wrapper used to simplify code
+ */
+
+/** @file src/core/NEON/wrapper/traits.h
+ * @brief Traits defined on NEON vectors
+ */
+
+/** @file src/core/NEON/wrapper/wrapper.h
+ * @brief Includes all wrapper headers at once
+ */
+
+/** @dir src/core/NEON/wrapper/intrinsics
+ * @brief NEON intrinsics wrappers
+ */
+
+/** @dir src/core/NEON/wrapper/scalar
+ * @brief Scalar operations
+ */
+
+/** @dir src/core/CL/gemm
+ * @brief Folder containing all the configuration files for GEMM
+ */
+
+/** @dir src/core/CL/kernels
+ * @brief All the OpenCL kernels
+ */
+
+/** @dir support
+ * @brief Various headers to work around toolchains / platform issues.
+ */
+
+/** @dir tests
+ * @brief All test related files shared between validation and benchmark.
+ */
+
+/** @file tests/main.cpp
+ * @brief Main entry point for the tests. Currently shared between validation and benchmarking.
+ */
+
+/** @dir tests/CL
+ * @brief OpenCL accessors.
+ */
+
+/** @dir tests/GLES_COMPUTE
+ * @brief GLES accessors.
+ */
+
+/** @dir tests/NEON
+ * @brief NEON accessors.
+ */
+
+/** @dir tests/benchmark
+ * @brief Sources for benchmarking.
+ */
+
+/** @dir tests/benchmark/CL
+ * @brief OpenCL benchmarking tests.
+ */
+
+/** @dir tests/benchmark/GLES_COMPUTE
+ * @brief GLES benchmarking tests.
+ */
+
+/** @dir tests/benchmark/NEON
+ * @brief NEON benchmarking tests.
+ */
+
+/** @dir tests/benchmark_examples
+ * @brief Sources needed to wrap examples to run through our benchmarking framework.
+ */
+
+/** @dir tests/framework
+ * @brief Boilerplate code for both validation and benchmark test suites (command line parsers, instruments, output loggers, etc.)
+ */
+
+/** @dir tests/instruments
+ * @brief User defined instruments that can be registered to the framework.
+ */
+
+/** @dir tests/validate_examples
+ * @brief Sources needed to wrap examples to run through our validation framework.
+ */
+
+/** @dir tests/validation
+ * @brief Sources for validation.
+ */
+
+/** @dir tests/validation/CL
+ * @brief OpenCL validation tests.
+ */
+
+/** @dir tests/validation/CPP
+ * @brief C++ validation tests.
+ */
+
+/** @dir tests/validation/GLES_COMPUTE
+ * @brief GLES validation tests.
+ */
+
+/** @dir tests/validation/NEON
+ * @brief NEON validation tests.
+ */
+
+/** @dir tests/validation/reference
+ * @brief Reference implementation used to validate the results of the various backends.
+ */
diff --git a/docs/Doxyfile b/docs/Doxyfile
index ef8966c..323ed21 100644
--- a/docs/Doxyfile
+++ b/docs/Doxyfile
@@ -38,7 +38,7 @@
# could be handy for archiving the generated documentation or if some version
# control system is used.
-PROJECT_NUMBER = 20.08
+PROJECT_NUMBER = 20.11
# Using the PROJECT_BRIEF tag one can provide an optional one line description
# for a project that appears at the top of each page and should give viewer a
@@ -291,7 +291,7 @@
# Note that for custom extensions you also need to set FILE_PATTERNS otherwise
# the files are not read by doxygen.
-EXTENSION_MAPPING = cl=C
+EXTENSION_MAPPING = cl=C bp=C dir=C
# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments
# according to the Markdown format, which allows for more readable
@@ -776,11 +776,14 @@
./docs/05_contribution_guidelines.dox \
./docs/06_functions_list.dox \
./docs/07_errata.dox \
+ ./docs/ComputeLibrary.dir \
./arm_compute/ \
./src/ \
./examples/ \
./tests/ \
./utils/ \
+ ./Android.bp \
+ ./scripts \
./support/
# This tag can be used to specify the character encoding of the source files
@@ -843,6 +846,8 @@
*.qsf \
*.as \
*.js \
+ *.bp \
+ *.dir \
*.cl
# The RECURSIVE tag can be used to specify whether or not subdirectories should