arm_compute v18.05
diff --git a/documentation/index.xhtml b/documentation/index.xhtml
index 91482da..659b007 100644
--- a/documentation/index.xhtml
+++ b/documentation/index.xhtml
@@ -40,7 +40,7 @@
<tr style="height: 56px;">
<td style="padding-left: 0.5em;">
<div id="projectname">Compute Library
-  <span id="projectnumber">18.03</span>
+  <span id="projectnumber">18.05</span>
</div>
</td>
</tr>
@@ -199,20 +199,34 @@
│ │ │ └── OpenGLES.h --> Wrapper to configure the Khronos EGL and OpenGL ES C header
│ │ ├── NEON
│ │ │ ├── kernels --> Folder containing all the NEON kernels
-│ │ │ │ ├── arm64 --> Folder containing the interfaces for the assembly arm64 NEON kernels
-│ │ │ │ ├── arm32 --> Folder containing the interfaces for the assembly arm32 NEON kernels
-│ │ │ │ ├── assembly --> Folder containing the NEON assembly routines.
+│ │ │ │ ├── assembly --> Headers for the assembly optimised NEON kernels.
+│ │ │ │ ├── convolution --> Headers for the assembly optimised NEON convolution kernels.
+│ │ │ │ │ ├── common --> Headers for code which is common to several convolution implementations.
+│ │ │ │ │ ├── depthwise --> Headers for the depthwise convolution assembly implementation
+│ │ │ │ │ └── winograd --> Headers for the Winograd convolution assembly implementation
+│ │ │ │ ├── detail --> Common code for several intrinsics implementations.
│ │ │ │ └── NE*Kernel.h
│ │ │ └── NEKernels.h --> Includes all the NEON kernels at once
│ │ ├── All common basic types (Types.h, Window, Coordinates, Iterator, etc.)
│ │ ├── All generic objects interfaces (ITensor, IImage, etc.)
│ │ └── Objects metadata classes (ImageInfo, TensorInfo, MultiImageInfo)
│ ├── graph
-│ │ ├── CL --> OpenCL specific operations
-│ │ │ └── CLMap.h / CLUnmap.h
+│ │ ├── algorithms
+│ │ │ └── Generic algorithms used by the graph backend (e.g. order of traversal)
+│ │ ├── backends --> The backend specific code
+│ │ │ ├── CL --> OpenCL specific operations
+│ │ │ ├── GLES --> OpenGLES Compute Shaders specific operations
+│ │ │ └── NEON --> NEON specific operations
+│ │ ├── detail
+│ │ │ └── Collection of internal utilities.
+│ │ ├── frontend
+│ │ │ └── Code related to the stream frontend interface.
+│ │ ├── mutators
+│ │ │ └── Used to modify / optimise the graph intermediate representation (operator fusion, in-place operations, etc.)
│ │ ├── nodes
│ │ │ └── The various nodes supported by the graph API
-│ │ ├── Nodes.h --> Includes all the Graph nodes at once.
+│ │ ├── printers
+│ │ │ └── Debug printers
│ │ └── Graph objects ( INode, ITensorAccessor, Graph, etc.)
│ └── runtime
│ ├── CL
@@ -220,10 +234,14 @@
│ │ ├── functions --> Folder containing all the OpenCL functions
│ │ │ └── CL*.h
│ │ ├── CLScheduler.h --> Interface to enqueue OpenCL kernels and get/set the OpenCL CommandQueue and ICLTuner.
-│ │ └── CLFunctions.h --> Includes all the OpenCL functions at once
+│ │ ├── CLFunctions.h --> Includes all the OpenCL functions at once
+│ │ └── tuners
+│ │ └── Local workgroup size tuners for specific architectures / GPUs
│ ├── CPP
│ │ ├── CPPKernels.h --> Includes all the CPP functions at once.
-│ │ └── CPPScheduler.h --> Basic pool of threads to execute CPP/NEON code on several cores in parallel
+│ │ ├── CPPScheduler.h --> Basic pool of threads to execute CPP/NEON code on several cores in parallel
+│ │ └── functions --> Folder containing all the CPP functions
+│ │ └── CPP*.h
│ ├── GLES_COMPUTE
│ │ ├── GLES objects & allocators (GCArray, GCImage, GCTensor, etc.)
│ │ ├── functions --> Folder containing all the GLES functions
@@ -250,6 +268,7 @@
│ ├── graph_*.cpp --> Graph examples
│ ├── neoncl_*.cpp --> NEON / OpenCL interoperability examples
│ └── neon_*.cpp --> NEON examples
+├── graph.h --> Includes all the Graph headers at once.
├── include
│ ├── CL
│ │ └── Khronos OpenCL C headers and C++ wrapper
@@ -280,31 +299,32 @@
│ └── Various headers to work around toolchains / platform issues.
├── tests
│ ├── All test related files shared between validation and benchmark
+│ ├── benchmark --> Sources for benchmarking
+│ │ ├── Benchmark specific files
+│ │ ├── fixtures
+│ │ │ └── Backend-agnostic fixtures to initialise and run the functions to test.
+│ │ ├── CL --> OpenCL benchmarking tests
+│ │ ├── GLES_COMPUTE --> GLES benchmarking tests
+│ │ └── NEON --> NEON benchmarking tests
│ ├── CL --> OpenCL accessors
│ ├── GLES_COMPUTE --> GLES accessors
│ ├── NEON --> NEON accessors
-│ ├── benchmark --> Sources for benchmarking
-│ │ ├── Benchmark specific files
-│ │ ├── CL --> OpenCL benchmarking tests
-│ │ ├── GLES_COMPUTE --> GLES benchmarking tests
-│ │ ├── fixtures
-│ │ │ └── Fixtures to initialise and run the runtime Functions.
-│ │ └── NEON --> NEON benchmarking tests
│ ├── datasets
│ │ └── Datasets for all the validation / benchmark tests, layer configurations for various networks, etc.
│ ├── framework
│ │ └── Boilerplate code for both validation and benchmark test suites (command line parsers, instruments, output loggers, etc.)
│ ├── networks
│ │ └── Examples of how to instantiate networks.
-│ ├── validation --> Sources for validation
-│ │ ├── Validation specific files
-│ │ ├── CL --> OpenCL validation tests
-│ │ ├── GLES_COMPUTE --> GLES validation tests
-│ │ ├── CPP --> C++ reference implementations
-│ │ ├── fixtures
-│ │ │ └── Fixtures to initialise and run the runtime Functions.
-│ │ └── NEON --> NEON validation tests
-│ └── dataset --> Datasets defining common sets of input parameters
+│ └── validation --> Sources for validation
+│ ├── Validation specific files
+│ ├── fixtures
+│ │ └── Backend-agnostic fixtures to initialise and run the functions to test.
+│ ├── reference
+│ │ └── Reference implementation used to validate the results of the various backends.
+│ ├── CL --> OpenCL validation tests
+│ ├── GLES_COMPUTE --> GLES validation tests
+│ ├── CPP --> C++ reference implementations
+│ └── NEON --> NEON validation tests
└── utils --> Boilerplate code used by examples
└── Various utilities to print types, load / store assets, etc.
</pre><h1><a class="anchor" id="S2_versions_changelog"></a>
@@ -317,6 +337,72 @@
</pre><dl class="section note"><dt>Note</dt><dd>We're aiming at releasing one major public release with new features per quarter. All releases in between will only contain bug fixes.</dd></dl>
<h2><a class="anchor" id="S2_2_changelog"></a>
Changelog</h2>
+<p>v18.05 Public major release</p><ul>
+<li>Various bug fixes.</li>
+<li>Various optimisations.</li>
+<li>Major redesign of the interface for the NEON kernels implemented in assembly.</li>
+<li>Removed arm_compute::NEGEMMLowpAArch64A53Kernel / arm_compute::NEGEMMLowpAArch64Kernel / arm_compute::NEGEMMLowpAArch64V8P4Kernel / arm_compute::NEGEMMInterleavedBlockedKernel / <a class="el" href="classarm__compute_1_1_n_e_g_e_m_m_lowp_assembly_matrix_multiply_core.xhtml" title="Basic function to execute matrix multiply assembly kernels. ">arm_compute::NEGEMMLowpAssemblyMatrixMultiplyCore</a> / arm_compute::NEHGEMMAArch64FP16Kernel</li>
+<li>Added NEGEMMAssemblyWrapper and <a class="el" href="classarm__compute_1_1_assembly_kernel_glue.xhtml" title="Assembly kernel glue. ">AssemblyKernelGlue</a>, which are used to execute assembly kernels in NEON functions.</li>
+<li>Minor changes to the <a class="el" href="classarm__compute_1_1_c_p_u_info.xhtml">CPUInfo</a> type to make it compatible with the new assembly GEMM interface.</li>
+<li>Moved the NEON assembly kernels to the folder src/core/NEON/kernels/arm_gemm.</li>
+<li>Improved Doxygen documentation.</li>
+<li>Improved memory management for layer transitions.</li>
+<li>Added support for NHWC data layout in tensors.</li>
+<li>Added NHWC data layout support to:<ul>
+<li><a class="el" href="classarm__compute_1_1_n_e_g_e_m_m_convolution_layer.xhtml">NEGEMMConvolutionLayer</a></li>
+<li><a class="el" href="classarm__compute_1_1_n_e_direct_convolution_layer.xhtml">NEDirectConvolutionLayer</a></li>
+<li><a class="el" href="classarm__compute_1_1_n_e_pooling_layer.xhtml">NEPoolingLayer</a> / <a class="el" href="classarm__compute_1_1_c_l_pooling_layer.xhtml">CLPoolingLayer</a></li>
+<li><a class="el" href="classarm__compute_1_1_n_e_batch_normalization_layer.xhtml">NEBatchNormalizationLayer</a> / <a class="el" href="classarm__compute_1_1_c_l_batch_normalization_layer.xhtml">CLBatchNormalizationLayer</a></li>
+<li><a class="el" href="classarm__compute_1_1_n_e_depthwise_convolution_layer.xhtml">NEDepthwiseConvolutionLayer</a></li>
+<li><a class="el" href="classarm__compute_1_1_n_e_scale.xhtml">NEScale</a></li>
+<li><a class="el" href="classarm__compute_1_1_n_e_im2_col.xhtml">NEIm2Col</a></li>
+</ul>
+</li>
+<li>Added support for dilated convolutions in <a class="el" href="classarm__compute_1_1_n_e_convolution_layer.xhtml">NEConvolutionLayer</a> and <a class="el" href="classarm__compute_1_1_c_l_convolution_layer.xhtml">CLConvolutionLayer</a>.</li>
+<li>New OpenCL kernels / functions:<ul>
+<li><a class="el" href="classarm__compute_1_1_c_l_channel_shuffle_layer.xhtml">CLChannelShuffleLayer</a> / <a class="el" href="classarm__compute_1_1_c_l_channel_shuffle_layer_kernel.xhtml">CLChannelShuffleLayerKernel</a></li>
+<li><a class="el" href="classarm__compute_1_1_c_l_convert_fully_connected_weights_kernel.xhtml">CLConvertFullyConnectedWeightsKernel</a> / <a class="el" href="classarm__compute_1_1_c_l_convert_fully_connected_weights.xhtml">CLConvertFullyConnectedWeights</a></li>
+<li><a class="el" href="classarm__compute_1_1_c_l_copy.xhtml">CLCopy</a> / <a class="el" href="classarm__compute_1_1_c_l_copy_kernel.xhtml">CLCopyKernel</a></li>
+<li><a class="el" href="classarm__compute_1_1_c_l_l_s_t_m_layer.xhtml">CLLSTMLayer</a></li>
+<li><a class="el" href="classarm__compute_1_1_c_l_r_n_n_layer.xhtml">CLRNNLayer</a></li>
+<li><a class="el" href="classarm__compute_1_1_c_l_width_concatenate_layer.xhtml">CLWidthConcatenateLayer</a> / <a class="el" href="classarm__compute_1_1_c_l_width_concatenate_layer_kernel.xhtml">CLWidthConcatenateLayerKernel</a></li>
+<li><a class="el" href="classarm__compute_1_1_c_l_winograd_filter_transform_kernel.xhtml">CLWinogradFilterTransformKernel</a> / <a class="el" href="classarm__compute_1_1_c_l_winograd_input_transform_kernel.xhtml">CLWinogradInputTransformKernel</a> / <a class="el" href="classarm__compute_1_1_c_l_winograd_convolution_layer.xhtml">CLWinogradConvolutionLayer</a></li>
+<li><a class="el" href="classarm__compute_1_1_c_l_winograd_input_transform_kernel.xhtml">CLWinogradInputTransformKernel</a> / <a class="el" href="classarm__compute_1_1_c_l_winograd_input_transform.xhtml">CLWinogradInputTransform</a></li>
+</ul>
+</li>
+<li>New NEON kernels / functions:<ul>
+<li><a class="el" href="classarm__compute_1_1_c_l_r_n_n_layer.xhtml">CLRNNLayer</a></li>
+<li><a class="el" href="classarm__compute_1_1_n_e_convert_fully_connected_weights_kernel.xhtml">NEConvertFullyConnectedWeightsKernel</a> / <a class="el" href="classarm__compute_1_1_n_e_convert_fully_connected_weights.xhtml">NEConvertFullyConnectedWeights</a>.</li>
+</ul>
+</li>
+<li>Created the validate method in <a class="el" href="classarm__compute_1_1_c_l_depthwise_convolution_layer.xhtml">CLDepthwiseConvolutionLayer</a>.</li>
+<li>Beta and gamma are no longer mandatory arguments in <a class="el" href="classarm__compute_1_1_n_e_batch_normalization_layer.xhtml">NEBatchNormalizationLayer</a> and <a class="el" href="classarm__compute_1_1_c_l_batch_normalization_layer.xhtml">CLBatchNormalizationLayer</a>.</li>
+<li>Added depth multiplier support in <a class="el" href="classarm__compute_1_1_n_e_depthwise_convolution_layer.xhtml">NEDepthwiseConvolutionLayer</a> and <a class="el" href="classarm__compute_1_1_c_l_depthwise_convolution_layer.xhtml">CLDepthwiseConvolutionLayer</a>.</li>
+<li>Added broadcast multiply support in <a class="el" href="classarm__compute_1_1_n_e_pixel_wise_multiplication.xhtml">NEPixelWiseMultiplication</a> / <a class="el" href="classarm__compute_1_1_n_e_pixel_wise_multiplication_kernel.xhtml">NEPixelWiseMultiplicationKernel</a>.</li>
+<li>Ported the MobileNet example to the NHWC data layout.</li>
+<li>Enabled Winograd method in <a class="el" href="classarm__compute_1_1_c_l_convolution_layer.xhtml">CLConvolutionLayer</a>.</li>
+<li>Renamed NEWinogradLayer to <a class="el" href="classarm__compute_1_1_n_e_winograd_convolution_layer.xhtml">NEWinogradConvolutionLayer</a>.</li>
+<li>Updated <a class="el" href="classarm__compute_1_1_n_e_winograd_convolution_layer.xhtml">NEWinogradConvolutionLayer</a> to use highly optimised assembly kernels in src/core/NEON/kernels/arm_gemm.</li>
+<li>Added memory manager support in GLES functions.</li>
+<li>Major refactoring of the graph API.</li>
+<li>Added GLES backend in the graph API.</li>
+<li>Added support for the memory manager in the graph API.</li>
+<li>Enabled Winograd Convolution method in the graph API.</li>
+<li>Added support for grouped convolutions in the graph API.</li>
+<li>Replaced NEDeconvolutionLayerUpsampleKernel with <a class="el" href="classarm__compute_1_1_n_e_scale_kernel.xhtml">NEScaleKernel</a> in <a class="el" href="classarm__compute_1_1_n_e_deconvolution_layer.xhtml">NEDeconvolutionLayer</a>.</li>
+<li>Added fast maths flag in <a class="el" href="classarm__compute_1_1_c_l_convolution_layer.xhtml">CLConvolutionLayer</a>.</li>
+<li>Added new tests and benchmarks in the validation and benchmark frameworks.</li>
+<li>Merged the Activation layer with the Convolution layer (NEON, CL, GLES).</li>
+<li>Added support for OpenCL 2.0 SVM.</li>
+<li>Added support for importing memory into OpenCL tensors.</li>
+<li>Added the prepare() method to perform any one-off pre-processing before running the function (see the sketch after this list).</li>
+<li>Added new examples:<ul>
+<li><a class="el" href="graph__inception__v4_8cpp.xhtml">graph_inception_v4.cpp</a></li>
+<li><a class="el" href="graph__resnext50_8cpp.xhtml">graph_resnext50.cpp</a></li>
+</ul>
+</li>
+<li>Added memory measurement instrument for CL.</li>
+</ul>
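+<p>As a minimal sketch of the new two-phase prepare() / run() pattern (the function choice, tensor setup and configure() arguments below are illustrative only, not prescriptive):</p>
+<pre class="fragment">#include "arm_compute/runtime/NEON/NEFunctions.h"
+#include "arm_compute/runtime/Tensor.h"
+
+using namespace arm_compute;
+
+void example()
+{
+    Tensor src{}, weights{}, biases{}, dst{};
+    // ... initialise the tensor shapes and allocate their backing memory ...
+
+    NEGEMMConvolutionLayer conv{};
+    conv.configure(&src, &weights, &biases, &dst, PadStrideInfo(1, 1, 1, 1));
+
+    conv.prepare(); // one-off pre-processing (e.g. weights reshaping), done once
+    conv.run();     // subsequent calls to run() reuse the prepared state
+}
+</pre>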
<p>v18.03 Public maintenance release</p><ul>
<li>Various bug fixes.</li>
<li>Fixed bug in <a class="el" href="classarm__compute_1_1_n_e_activation_layer.xhtml">NEActivationLayer</a></li>
@@ -324,6 +410,7 @@
<li>Updated recommended NDK version to r16b (and fixed warnings).</li>
<li>Fixed bug in validation code.</li>
<li>Added Inception v4 graph example.</li>
+<li>Renamed NEWinogradLayer to <a class="el" href="classarm__compute_1_1_n_e_winograd_convolution_layer.xhtml">NEWinogradConvolutionLayer</a></li>
</ul>
<p>v18.02 Public major release</p><ul>
<li>Various NEON / OpenCL / GLES optimisations.</li>
@@ -365,9 +452,9 @@
<li>Added name() method to all kernels.</li>
<li>Added support for Winograd 5x5.</li>
<li><a class="el" href="classarm__compute_1_1_n_e_permute_kernel.xhtml">NEPermuteKernel</a> / <a class="el" href="classarm__compute_1_1_n_e_permute.xhtml">NEPermute</a></li>
-<li><a class="el" href="classarm__compute_1_1_n_e_winograd_layer_transform_input_kernel.xhtml">NEWinogradLayerTransformInputKernel</a> / <a class="el" href="classarm__compute_1_1_n_e_winograd_layer.xhtml">NEWinogradLayer</a></li>
-<li><a class="el" href="classarm__compute_1_1_n_e_winograd_layer_transform_output_kernel.xhtml">NEWinogradLayerTransformOutputKernel</a> / <a class="el" href="classarm__compute_1_1_n_e_winograd_layer.xhtml">NEWinogradLayer</a></li>
-<li><a class="el" href="classarm__compute_1_1_n_e_winograd_layer_transform_weights_kernel.xhtml">NEWinogradLayerTransformWeightsKernel</a> / <a class="el" href="classarm__compute_1_1_n_e_winograd_layer.xhtml">NEWinogradLayer</a></li>
+<li><a class="el" href="classarm__compute_1_1_n_e_winograd_layer_transform_input_kernel.xhtml">NEWinogradLayerTransformInputKernel</a> / NEWinogradLayer</li>
+<li><a class="el" href="classarm__compute_1_1_n_e_winograd_layer_transform_output_kernel.xhtml">NEWinogradLayerTransformOutputKernel</a> / NEWinogradLayer</li>
+<li><a class="el" href="classarm__compute_1_1_n_e_winograd_layer_transform_weights_kernel.xhtml">NEWinogradLayerTransformWeightsKernel</a> / NEWinogradLayer</li>
<li>Renamed NEWinogradLayerKernel into <a class="el" href="classarm__compute_1_1_n_e_winograd_layer_batched_g_e_m_m_kernel.xhtml">NEWinogradLayerBatchedGEMMKernel</a></li>
</ul>
</li>
@@ -443,13 +530,13 @@
</ul>
</li>
<li>New NEON kernels / functions<ul>
-<li><a class="el" href="classarm__compute_1_1_n_e_g_e_m_m_lowp_a_arch64_a53_kernel.xhtml">NEGEMMLowpAArch64A53Kernel</a> / <a class="el" href="classarm__compute_1_1_n_e_g_e_m_m_lowp_a_arch64_kernel.xhtml">NEGEMMLowpAArch64Kernel</a> / <a class="el" href="classarm__compute_1_1_n_e_g_e_m_m_lowp_a_arch64_v8_p4_kernel.xhtml">NEGEMMLowpAArch64V8P4Kernel</a> / NEGEMMInterleavedBlockedKernel / <a class="el" href="classarm__compute_1_1_n_e_g_e_m_m_lowp_assembly_matrix_multiply_core.xhtml">NEGEMMLowpAssemblyMatrixMultiplyCore</a></li>
-<li><a class="el" href="classarm__compute_1_1_n_e_h_g_e_m_m_a_arch64_f_p16_kernel.xhtml">NEHGEMMAArch64FP16Kernel</a></li>
+<li>arm_compute::NEGEMMLowpAArch64A53Kernel / arm_compute::NEGEMMLowpAArch64Kernel / arm_compute::NEGEMMLowpAArch64V8P4Kernel / arm_compute::NEGEMMInterleavedBlockedKernel / <a class="el" href="classarm__compute_1_1_n_e_g_e_m_m_lowp_assembly_matrix_multiply_core.xhtml" title="Basic function to execute matrix multiply assembly kernels. ">arm_compute::NEGEMMLowpAssemblyMatrixMultiplyCore</a></li>
+<li>arm_compute::NEHGEMMAArch64FP16Kernel</li>
<li><a class="el" href="classarm__compute_1_1_n_e_depthwise_convolution_layer3x3_kernel.xhtml">NEDepthwiseConvolutionLayer3x3Kernel</a> / <a class="el" href="classarm__compute_1_1_n_e_depthwise_im2_col_kernel.xhtml">NEDepthwiseIm2ColKernel</a> / <a class="el" href="classarm__compute_1_1_n_e_g_e_m_m_matrix_vector_multiply_kernel.xhtml">NEGEMMMatrixVectorMultiplyKernel</a> / <a class="el" href="classarm__compute_1_1_n_e_depthwise_vector_to_tensor_kernel.xhtml">NEDepthwiseVectorToTensorKernel</a> / <a class="el" href="classarm__compute_1_1_n_e_depthwise_convolution_layer.xhtml">NEDepthwiseConvolutionLayer</a></li>
<li><a class="el" href="classarm__compute_1_1_n_e_g_e_m_m_lowp_offset_contribution_kernel.xhtml">NEGEMMLowpOffsetContributionKernel</a> / <a class="el" href="classarm__compute_1_1_n_e_g_e_m_m_lowp_matrix_a_reduction_kernel.xhtml">NEGEMMLowpMatrixAReductionKernel</a> / <a class="el" href="classarm__compute_1_1_n_e_g_e_m_m_lowp_matrix_b_reduction_kernel.xhtml">NEGEMMLowpMatrixBReductionKernel</a> / <a class="el" href="classarm__compute_1_1_n_e_g_e_m_m_lowp_matrix_multiply_core.xhtml">NEGEMMLowpMatrixMultiplyCore</a></li>
<li><a class="el" href="classarm__compute_1_1_n_e_g_e_m_m_lowp_quantize_down_int32_to_uint8_scale_by_fixed_point_kernel.xhtml">NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel</a> / <a class="el" href="classarm__compute_1_1_n_e_g_e_m_m_lowp_quantize_down_int32_to_uint8_scale_by_fixed_point.xhtml">NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint</a></li>
<li><a class="el" href="classarm__compute_1_1_n_e_g_e_m_m_lowp_quantize_down_int32_to_uint8_scale_kernel.xhtml">NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel</a> / <a class="el" href="classarm__compute_1_1_n_e_g_e_m_m_lowp_quantize_down_int32_to_uint8_scale.xhtml">NEGEMMLowpQuantizeDownInt32ToUint8Scale</a></li>
-<li><a class="el" href="classarm__compute_1_1_n_e_winograd_layer.xhtml">NEWinogradLayer</a> / NEWinogradLayerKernel</li>
+<li>NEWinogradLayer / NEWinogradLayerKernel</li>
</ul>
</li>
<li>New OpenCL kernels / functions<ul>
@@ -459,13 +546,13 @@
</ul>
</li>
<li>New graph nodes for NEON and OpenCL<ul>
-<li><a class="el" href="classarm__compute_1_1graph_1_1_branch_layer.xhtml">graph::BranchLayer</a></li>
-<li><a class="el" href="classarm__compute_1_1graph_1_1_depth_convert_layer.xhtml">graph::DepthConvertLayer</a></li>
-<li><a class="el" href="classarm__compute_1_1graph_1_1_depthwise_convolution_layer.xhtml">graph::DepthwiseConvolutionLayer</a></li>
-<li><a class="el" href="classarm__compute_1_1graph_1_1_dequantization_layer.xhtml">graph::DequantizationLayer</a></li>
-<li><a class="el" href="classarm__compute_1_1graph_1_1_flatten_layer.xhtml">graph::FlattenLayer</a></li>
-<li><a class="el" href="classarm__compute_1_1graph_1_1_quantization_layer.xhtml">graph::QuantizationLayer</a></li>
-<li><a class="el" href="classarm__compute_1_1graph_1_1_reshape_layer.xhtml">graph::ReshapeLayer</a></li>
+<li>graph::BranchLayer</li>
+<li>graph::DepthConvertLayer</li>
+<li>graph::DepthwiseConvolutionLayer</li>
+<li>graph::DequantizationLayer</li>
+<li>graph::FlattenLayer</li>
+<li>graph::QuantizationLayer</li>
+<li>graph::ReshapeLayer</li>
</ul>
</li>
</ul>
@@ -488,7 +575,7 @@
<li>New validation and benchmark frameworks (Boost and Google frameworks replaced by homemade framework).</li>
<li>Most machine learning functions support both fixed point 8 and 16 bit (QS8, QS16) for both NEON and OpenCL.</li>
<li>New NEON kernels / functions:<ul>
-<li><a class="el" href="classarm__compute_1_1_n_e_g_e_m_m_assembly_base_kernel.xhtml">NEGEMMAssemblyBaseKernel</a> <a class="el" href="classarm__compute_1_1_n_e_g_e_m_m_a_arch64_kernel.xhtml">NEGEMMAArch64Kernel</a></li>
+<li><a class="el" href="classarm__compute_1_1_n_e_g_e_m_m_assembly_base_kernel.xhtml" title="Base class for GEMM NEON kernels implemented in Assembly. ">arm_compute::NEGEMMAssemblyBaseKernel</a> arm_compute::NEGEMMAArch64Kernel</li>
<li><a class="el" href="classarm__compute_1_1_n_e_dequantization_layer_kernel.xhtml">NEDequantizationLayerKernel</a> / <a class="el" href="classarm__compute_1_1_n_e_dequantization_layer.xhtml">NEDequantizationLayer</a></li>
<li><a class="el" href="classarm__compute_1_1_n_e_floor_kernel.xhtml">NEFloorKernel</a> / <a class="el" href="classarm__compute_1_1_n_e_floor.xhtml">NEFloor</a></li>
<li><a class="el" href="classarm__compute_1_1_n_e_l2_normalize_layer_kernel.xhtml">NEL2NormalizeLayerKernel</a> / <a class="el" href="classarm__compute_1_1_n_e_l2_normalize_layer.xhtml">NEL2NormalizeLayer</a></li>
@@ -499,7 +586,7 @@
</ul>
</li>
<li>New OpenCL kernels / functions:<ul>
-<li><a class="el" href="classarm__compute_1_1_c_l_depthwise_convolution_layer3x3_kernel.xhtml">CLDepthwiseConvolutionLayer3x3Kernel</a> <a class="el" href="classarm__compute_1_1_c_l_depthwise_im2_col_kernel.xhtml">CLDepthwiseIm2ColKernel</a> <a class="el" href="classarm__compute_1_1_c_l_depthwise_vector_to_tensor_kernel.xhtml">CLDepthwiseVectorToTensorKernel</a> <a class="el" href="classarm__compute_1_1_c_l_depthwise_weights_reshape_kernel.xhtml">CLDepthwiseWeightsReshapeKernel</a> / <a class="el" href="classarm__compute_1_1_c_l_depthwise_convolution_layer3x3.xhtml">CLDepthwiseConvolutionLayer3x3</a> <a class="el" href="classarm__compute_1_1_c_l_depthwise_convolution_layer.xhtml">CLDepthwiseConvolutionLayer</a> <a class="el" href="classarm__compute_1_1_c_l_depthwise_separable_convolution_layer.xhtml">CLDepthwiseSeparableConvolutionLayer</a></li>
+<li><a class="el" href="classarm__compute_1_1_c_l_depthwise_convolution_layer3x3_n_c_h_w_kernel.xhtml">CLDepthwiseConvolutionLayer3x3NCHWKernel</a> <a class="el" href="classarm__compute_1_1_c_l_depthwise_convolution_layer3x3_n_h_w_c_kernel.xhtml">CLDepthwiseConvolutionLayer3x3NHWCKernel</a> <a class="el" href="classarm__compute_1_1_c_l_depthwise_im2_col_kernel.xhtml">CLDepthwiseIm2ColKernel</a> <a class="el" href="classarm__compute_1_1_c_l_depthwise_vector_to_tensor_kernel.xhtml">CLDepthwiseVectorToTensorKernel</a> <a class="el" href="classarm__compute_1_1_c_l_depthwise_weights_reshape_kernel.xhtml">CLDepthwiseWeightsReshapeKernel</a> / <a class="el" href="classarm__compute_1_1_c_l_depthwise_convolution_layer3x3.xhtml">CLDepthwiseConvolutionLayer3x3</a> <a class="el" href="classarm__compute_1_1_c_l_depthwise_convolution_layer.xhtml">CLDepthwiseConvolutionLayer</a> <a class="el" href="classarm__compute_1_1_c_l_depthwise_separable_convolution_layer.xhtml">CLDepthwiseSeparableConvolutionLayer</a></li>
<li><a class="el" href="classarm__compute_1_1_c_l_dequantization_layer_kernel.xhtml">CLDequantizationLayerKernel</a> / <a class="el" href="classarm__compute_1_1_c_l_dequantization_layer.xhtml">CLDequantizationLayer</a></li>
<li><a class="el" href="classarm__compute_1_1_c_l_direct_convolution_layer_kernel.xhtml">CLDirectConvolutionLayerKernel</a> / <a class="el" href="classarm__compute_1_1_c_l_direct_convolution_layer.xhtml">CLDirectConvolutionLayer</a></li>
<li><a class="el" href="classarm__compute_1_1_c_l_flatten_layer.xhtml">CLFlattenLayer</a></li>
@@ -741,7 +828,7 @@
<dl class="section note"><dt>Note</dt><dd>If you want to natively compile for 32bit on a 64bit ARM device running a 64bit OS then you will have to use cross-compile too.</dd></dl>
<p>There is also an 'embed_only' option which will generate all the .embed files for the OpenCL kernels and / or OpenGLES compute shaders. This might be useful if using a different build system to compile the library.</p>
<p><b>Werror:</b> If you are compiling using the same toolchains as the ones used in this guide then there shouldn't be any warnings, and you should be able to keep Werror=1. If the library fails to build with a different compiler version because warnings are interpreted as errors, and you are sure the warnings are not important, you can try building with Werror=0 (but please do report the issue either on Github or by an email to <a href="#" onclick="location.href='mai'+'lto:'+'dev'+'el'+'ope'+'r@'+'arm'+'.c'+'om'; return false;">devel<span style="display: none;">.nosp@m.</span>oper<span style="display: none;">.nosp@m.</span>@arm.<span style="display: none;">.nosp@m.</span>com</a> so that the issue can be addressed).</p>
-<p><b>opencl</b> / <b>neon</b> / <b><a class="el" href="namespacearm__compute_1_1gles__compute.xhtml">gles_compute</a>:</b> Choose which SIMD technology you want to target. (NEON for ARM Cortex-A CPUs or OpenCL / GLES_COMPUTE for ARM Mali GPUs)</p>
+<p><b>opencl</b> / <b>neon</b> / <b>gles_compute:</b> Choose which SIMD technology you want to target. (NEON for ARM Cortex-A CPUs or OpenCL / GLES_COMPUTE for ARM Mali GPUs)</p>
<p><b>embed_kernels:</b> For OpenCL / GLES_COMPUTE only: set embed_kernels=1 if you want the OpenCL / GLES_COMPUTE kernels to be built in the library's binaries instead of being read from separate ".cl" / ".cs" files. If embed_kernels is set to 0 then the application can set the path to the folder containing the OpenCL / GLES_COMPUTE kernel files by calling <a class="el" href="classarm__compute_1_1_c_l_kernel_library.xhtml#af353532ea782387df6bcb6d01894f4ae" title="Initialises the kernel library. ">CLKernelLibrary::init()</a> / <a class="el" href="classarm__compute_1_1_g_c_kernel_library.xhtml#abe24625d55f2fb35da7e293e5e28d483" title="Initialises the kernel library. ">GCKernelLibrary::init()</a>. By default the path is set to "./cl_kernels" / "./cs_shaders".</p>
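<p>For example, a minimal sketch of initialising the OpenCL runtime and then pointing the kernel library at an external kernel folder (the path is illustrative, and the default OpenCL context / device are assumed):</p>
<pre class="fragment">#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/runtime/CL/CLScheduler.h"

using namespace arm_compute;

void init_cl_runtime()
{
    // Create the default OpenCL context and command queue for the library.
    CLScheduler::get().default_init();
    // With embed_kernels=0, tell the library where the .cl kernel files live.
    CLKernelLibrary::get().init("./cl_kernels/", cl::Context::getDefault(), cl::Device::getDefault());
}
</pre>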
<p><b>set_soname:</b> Do you want to build the versioned version of the library?</p>
<p>If enabled, the library will contain a SONAME and SHLIBVERSION, and some symlinks will automatically be created between the objects. Example: </p><pre class="fragment">libarm_compute_core.so -> libarm_compute_core.so.1.0.0
libarm_compute_core.so.1 -> libarm_compute_core.so.1.0.0
libarm_compute_core.so.1.0.0
</pre>
@@ -767,9 +854,6 @@
<li>gcc-linaro-4.9-2016.02-x86_64_aarch64-linux-gnu</li>
<li>gcc-linaro-6.3.1-2017.02-i686_aarch64-linux-gnu</li>
</ul>
-<dl class="section note"><dt>Note</dt><dd>If you are building with opencl=1 then scons will expect to find libOpenCL.so either in the current directory or in "build" (See the section below if you need a stub OpenCL library to link against) </dd>
-<dd>
-If you are building with <a class="el" href="namespacearm__compute_1_1gles__compute.xhtml">gles_compute</a>=1 then scons will expect to find libEGL.so / libGLESv1_CM.so / libGLESv2.so either in the current directory or in "build" (See the section below if you need a stub OpenCL library to link against)</dd></dl>
<p>To cross-compile the library in debug mode, with NEON only support, for Linux 32bit: </p><pre class="fragment">scons Werror=1 -j8 debug=1 neon=1 opencl=0 os=linux arch=armv7a
</pre><p>To cross-compile the library in asserts mode, with OpenCL only support, for Linux 64bit: </p><pre class="fragment">scons Werror=1 -j8 debug=0 asserts=1 neon=0 opencl=1 embed_kernels=1 os=linux arch=arm64-v8a
</pre><p>To cross-compile the library in asserts mode, with GLES_COMPUTE only support, for Linux 64bit: </p><pre class="fragment">scons Werror=1 -j8 debug=0 asserts=1 neon=0 opencl=0 gles_compute=1 embed_kernels=1 os=linux arch=arm64-v8a
@@ -783,12 +867,12 @@
<h3><a class="anchor" id="S3_2_2_examples"></a>
How to manually build the examples?</h3>
<p>The examples get automatically built by scons as part of the build process of the library described above. This section just describes how you can build and link your own application against our library.</p>
-<dl class="section note"><dt>Note</dt><dd>The following command lines assume the <a class="el" href="namespacearm__compute.xhtml" title="This file contains all available output stages for GEMMLowp on OpenCL. ">arm_compute</a> and libOpenCL binaries are present in the current directory or in the system library path. If this is not the case you can specify the location of the pre-built library with the compiler option -L. When building the OpenCL example the commands below assume that the CL headers are located in the include folder where the command is executed.</dd></dl>
+<dl class="section note"><dt>Note</dt><dd>The following command lines assume the <a class="el" href="namespacearm__compute.xhtml" title="This file contains all available output stages for GEMMLowp on OpenCL. ">arm_compute</a> binaries are present in the current directory or in the system library path. If this is not the case you can specify the location of the pre-built library with the compiler option -L. When building the OpenCL example the commands below assume that the CL headers are located in the include folder where the command is executed.</dd></dl>
<p>To cross compile a NEON example for Linux 32bit: </p><pre class="fragment">arm-linux-gnueabihf-g++ examples/neon_convolution.cpp utils/Utils.cpp -I. -Iinclude -std=c++11 -mfpu=neon -L. -larm_compute -larm_compute_core -o neon_convolution
</pre><p>To cross compile a NEON example for Linux 64bit: </p><pre class="fragment">aarch64-linux-gnu-g++ examples/neon_convolution.cpp utils/Utils.cpp -I. -Iinclude -std=c++11 -L. -larm_compute -larm_compute_core -o neon_convolution
</pre><p>(notice the only difference with the 32 bit command is that we don't need the -mfpu option and the compiler's name is different)</p>
-<p>To cross compile an OpenCL example for Linux 32bit: </p><pre class="fragment">arm-linux-gnueabihf-g++ examples/cl_convolution.cpp utils/Utils.cpp -I. -Iinclude -std=c++11 -mfpu=neon -L. -larm_compute -larm_compute_core -lOpenCL -o cl_convolution -DARM_COMPUTE_CL
-</pre><p>To cross compile an OpenCL example for Linux 64bit: </p><pre class="fragment">aarch64-linux-gnu-g++ examples/cl_convolution.cpp utils/Utils.cpp -I. -Iinclude -std=c++11 -L. -larm_compute -larm_compute_core -lOpenCL -o cl_convolution -DARM_COMPUTE_CL
+<p>To cross compile an OpenCL example for Linux 32bit: </p><pre class="fragment">arm-linux-gnueabihf-g++ examples/cl_convolution.cpp utils/Utils.cpp -I. -Iinclude -std=c++11 -mfpu=neon -L. -larm_compute -larm_compute_core -o cl_convolution -DARM_COMPUTE_CL
+</pre><p>To cross compile an OpenCL example for Linux 64bit: </p><pre class="fragment">aarch64-linux-gnu-g++ examples/cl_convolution.cpp utils/Utils.cpp -I. -Iinclude -std=c++11 -L. -larm_compute -larm_compute_core -o cl_convolution -DARM_COMPUTE_CL
</pre><p>To cross compile a GLES example for Linux 32bit: </p><pre class="fragment">arm-linux-gnueabihf-g++ examples/gc_absdiff.cpp utils/Utils.cpp -I. -Iinclude/ -L. -larm_compute -larm_compute_core -std=c++11 -mfpu=neon -DARM_COMPUTE_GC -Iinclude/linux/ -o gc_absdiff
</pre><p>To cross compile a GLES example for Linux 64bit: </p><pre class="fragment">aarch64-linux-gnu-g++ examples/gc_absdiff.cpp utils/Utils.cpp -I. -Iinclude/ -L. -larm_compute -larm_compute_core -std=c++11 -DARM_COMPUTE_GC -Iinclude/linux/ -o gc_absdiff
</pre><p>(notice the only difference with the 32 bit command is that we don't need the -mfpu option and the compiler's name is different)</p>
@@ -801,7 +885,7 @@
<p>To compile natively (i.e. directly on an ARM device) for NEON for Linux 32bit: </p><pre class="fragment">g++ examples/neon_convolution.cpp utils/Utils.cpp -I. -Iinclude -std=c++11 -mfpu=neon -larm_compute -larm_compute_core -o neon_convolution
</pre><p>To compile natively (i.e. directly on an ARM device) for NEON for Linux 64bit: </p><pre class="fragment">g++ examples/neon_convolution.cpp utils/Utils.cpp -I. -Iinclude -std=c++11 -larm_compute -larm_compute_core -o neon_convolution
</pre><p>(notice the only difference with the 32 bit command is that we don't need the -mfpu option)</p>
-<p>To compile natively (i.e directly on an ARM device) for OpenCL for Linux 32bit or Linux 64bit: </p><pre class="fragment">g++ examples/cl_convolution.cpp utils/Utils.cpp -I. -Iinclude -std=c++11 -larm_compute -larm_compute_core -lOpenCL -o cl_convolution -DARM_COMPUTE_CL
+<p>To compile natively (i.e. directly on an ARM device) for OpenCL for Linux 32bit or Linux 64bit: </p><pre class="fragment">g++ examples/cl_convolution.cpp utils/Utils.cpp -I. -Iinclude -std=c++11 -larm_compute -larm_compute_core -o cl_convolution -DARM_COMPUTE_CL
</pre><p>To compile natively (i.e. directly on an ARM device) for GLES for Linux 32bit or Linux 64bit: </p><pre class="fragment">g++ examples/gc_absdiff.cpp utils/Utils.cpp -I. -Iinclude/ -L. -larm_compute -larm_compute_core -std=c++11 -DARM_COMPUTE_GC -Iinclude/linux/ -o gc_absdiff
</pre><p>To compile natively the examples with the Graph API, such as <a class="el" href="graph__lenet_8cpp.xhtml">graph_lenet.cpp</a>, you need to link the examples against arm_compute_graph.so too. </p><dl class="section note"><dt>Note</dt><dd>The compute library must currently be built with both neon and opencl enabled - neon=1 and opencl=1</dd></dl>
<p>i.e. to natively compile the "graph_lenet" example for Linux 32bit: </p><pre class="fragment">g++ examples/graph_lenet.cpp utils/Utils.cpp utils/GraphUtils.cpp -I. -Iinclude -std=c++11 -mfpu=neon -L. -larm_compute_graph -larm_compute -larm_compute_core -Wl,--allow-shlib-undefined -o graph_lenet
@@ -813,12 +897,16 @@
<p>To run the built executable simply run: </p><pre class="fragment">LD_LIBRARY_PATH=build ./neon_convolution
</pre><p>or </p><pre class="fragment">LD_LIBRARY_PATH=build ./cl_convolution
</pre><dl class="section note"><dt>Note</dt><dd>Examples accept different types of arguments, to find out what they are run the example without any argument and the help will be displayed at the beginning of the run.</dd></dl>
-<p>For example: LD_LIBRARY_PATH=. ./graph_lenet</p>
-<p>./graph_lenet</p>
-<p>Usage: ./graph_lenet [target] [path_to_data] [batches]</p>
-<p>No data folder provided: using random values</p>
-<p>Test passed</p>
-<p>In this case the first argument of LeNet (like all the graph examples) is the target (i.e 0 to run on NEON, 1 to run on OpenCL if available, 2 to run on OpenCL using the <a class="el" href="classarm__compute_1_1_c_l_tuner.xhtml" title="Basic implementation of the OpenCL tuner interface. ">CLTuner</a>), the second argument is the path to the folder containing the npy files for the weights and finally the third argument is the number of batches to run.</p>
+<p>For example: </p><pre class="fragment">LD_LIBRARY_PATH=. ./graph_lenet
+
+./graph_lenet
+
+Usage: ./graph_lenet [target] [path_to_data] [batches]
+
+No data folder provided: using random values
+
+Test passed
+</pre><p>In this case the first argument of LeNet (like all the graph examples) is the target (i.e. 0 to run on NEON, 1 to run on OpenCL if available, 2 to run on OpenCL using the <a class="el" href="classarm__compute_1_1_c_l_tuner.xhtml" title="Basic implementation of the OpenCL tuner interface. ">CLTuner</a>), the second argument is the path to the folder containing the npy files for the weights, and finally the third argument is the number of batches to run.</p>
<h2><a class="anchor" id="S3_3_android"></a>
Building for Android</h2>
<p>For Android, the library was successfully built and tested using Google's standalone toolchains:</p><ul>
@@ -829,39 +917,39 @@
<ul>
<li>Download the NDK r16b from here: <a href="https://developer.android.com/ndk/downloads/index.html">https://developer.android.com/ndk/downloads/index.html</a></li>
<li>Make sure you have Python 2 installed on your machine.</li>
-<li><p class="startli">Generate the 32 and/or 64 toolchains by running the following commands:</p>
-<p class="startli">$NDK/build/tools/make_standalone_toolchain.py –arch arm64 –install-dir $MY_TOOLCHAINS/aarch64-linux-android-ndk-r16b –stl gnustl –api 21 $NDK/build/tools/make_standalone_toolchain.py –arch arm –install-dir $MY_TOOLCHAINS/arm-linux-android-ndk-r16b –stl gnustl –api 21</p>
-</li>
+<li>Generate the 32 and/or 64 bit toolchains by running the following commands:</li>
</ul>
-<dl class="section attention"><dt>Attention</dt><dd>Due to some NDK issues make sure you use clang++ & gnustl</dd></dl>
-<dl class="section note"><dt>Note</dt><dd>Make sure to add the toolchains to your PATH: export PATH=$PATH:$MY_TOOLCHAINS/aarch64-linux-android-4.9/bin:$MY_TOOLCHAINS/arm-linux-androideabi-4.9/bin</dd></dl>
+<pre class="fragment">$NDK/build/tools/make_standalone_toolchain.py --arch arm64 --install-dir $MY_TOOLCHAINS/aarch64-linux-android-ndk-r16b --stl gnustl --api 21
+$NDK/build/tools/make_standalone_toolchain.py --arch arm --install-dir $MY_TOOLCHAINS/arm-linux-android-ndk-r16b --stl gnustl --api 21
+</pre><dl class="section attention"><dt>Attention</dt><dd>Due to some NDK issues make sure you use clang++ & gnustl</dd></dl>
+<dl class="section note"><dt>Note</dt><dd>Make sure to add the toolchains to your PATH: <pre class="fragment">export PATH=$PATH:$MY_TOOLCHAINS/aarch64-linux-android-ndk-r16b/bin:$MY_TOOLCHAINS/arm-linux-android-ndk-r16b/bin
+</pre></dd></dl>
<h3><a class="anchor" id="S3_3_1_library"></a>
How to build the library?</h3>
-<dl class="section note"><dt>Note</dt><dd>If you are building with opencl=1 then scons will expect to find libOpenCL.so either in the current directory or in "build" (See the section below if you need a stub OpenCL library to link against)</dd></dl>
<p>To cross-compile the library in debug mode, with NEON only support, for Android 32bit: </p><pre class="fragment">CXX=clang++ CC=clang scons Werror=1 -j8 debug=1 neon=1 opencl=0 os=android arch=armv7a
</pre><p>To cross-compile the library in asserts mode, with OpenCL only support, for Android 64bit: </p><pre class="fragment">CXX=clang++ CC=clang scons Werror=1 -j8 debug=0 asserts=1 neon=0 opencl=1 embed_kernels=1 os=android arch=arm64-v8a
</pre><p>To cross-compile the library in asserts mode, with GLES_COMPUTE only support, for Android 64bit: </p><pre class="fragment">CXX=clang++ CC=clang scons Werror=1 -j8 debug=0 asserts=1 neon=0 opencl=0 gles_compute=1 embed_kernels=1 os=android arch=arm64-v8a
</pre><h3><a class="anchor" id="S3_3_2_examples"></a>
How to manually build the examples?</h3>
<p>The examples get automatically built by scons as part of the build process of the library described above. This section just describes how you can build and link your own application against our library.</p>
-<dl class="section note"><dt>Note</dt><dd>The following command lines assume the <a class="el" href="namespacearm__compute.xhtml" title="This file contains all available output stages for GEMMLowp on OpenCL. ">arm_compute</a> and libOpenCL binaries are present in the current directory or in the system library path. If this is not the case you can specify the location of the pre-built library with the compiler option -L. When building the OpenCL example the commands below assume that the CL headers are located in the include folder where the command is executed.</dd></dl>
+<dl class="section note"><dt>Note</dt><dd>The following command lines assume the <a class="el" href="namespacearm__compute.xhtml" title="This file contains all available output stages for GEMMLowp on OpenCL. ">arm_compute</a> binaries are present in the current directory or in the system library path. If this is not the case you can specify the location of the pre-built library with the compiler option -L. When building the OpenCL example the commands below assume that the CL headers are located in the include folder where the command is executed.</dd></dl>
<p>Once you've got your Android standalone toolchain built and added to your path, you can do the following:</p>
<p>To cross compile a NEON example: </p><pre class="fragment">#32 bit:
arm-linux-androideabi-clang++ examples/neon_convolution.cpp utils/Utils.cpp -I. -Iinclude -std=c++11 -larm_compute-static -larm_compute_core-static -L. -o neon_convolution_arm -static-libstdc++ -pie
#64 bit:
aarch64-linux-android-clang++ examples/neon_convolution.cpp utils/Utils.cpp -I. -Iinclude -std=c++11 -larm_compute-static -larm_compute_core-static -L. -o neon_convolution_aarch64 -static-libstdc++ -pie
</pre><p>To cross compile an OpenCL example: </p><pre class="fragment">#32 bit:
-arm-linux-androideabi-clang++ examples/cl_convolution.cpp utils/Utils.cpp -I. -Iinclude -std=c++11 -larm_compute-static -larm_compute_core-static -L. -o cl_convolution_arm -static-libstdc++ -pie -lOpenCL -DARM_COMPUTE_CL
+arm-linux-androideabi-clang++ examples/cl_convolution.cpp utils/Utils.cpp -I. -Iinclude -std=c++11 -larm_compute-static -larm_compute_core-static -L. -o cl_convolution_arm -static-libstdc++ -pie -DARM_COMPUTE_CL
#64 bit:
-aarch64-linux-android-clang++ examples/cl_convolution.cpp utils/Utils.cpp -I. -Iinclude -std=c++11 -larm_compute-static -larm_compute_core-static -L. -o cl_convolution_aarch64 -static-libstdc++ -pie -lOpenCL -DARM_COMPUTE_CL
+aarch64-linux-android-clang++ examples/cl_convolution.cpp utils/Utils.cpp -I. -Iinclude -std=c++11 -larm_compute-static -larm_compute_core-static -L. -o cl_convolution_aarch64 -static-libstdc++ -pie -DARM_COMPUTE_CL
</pre><p>To cross compile a GLES example: </p><pre class="fragment">#32 bit:
arm-linux-androideabi-clang++ examples/gc_absdiff.cpp utils/Utils.cpp -I. -Iinclude -std=c++11 -larm_compute-static -larm_compute_core-static -L. -o gc_absdiff_arm -static-libstdc++ -pie -DARM_COMPUTE_GC
#64 bit:
aarch64-linux-android-clang++ examples/gc_absdiff.cpp utils/Utils.cpp -I. -Iinclude -std=c++11 -larm_compute-static -larm_compute_core-static -L. -o gc_absdiff_aarch64 -static-libstdc++ -pie -DARM_COMPUTE_GC
</pre><p>To cross compile the examples with the Graph API, such as <a class="el" href="graph__lenet_8cpp.xhtml">graph_lenet.cpp</a>, you also need to link the arm_compute_graph library. (notice the compute library has to be built with both neon and opencl enabled - neon=1 and opencl=1) </p><pre class="fragment">#32 bit:
-arm-linux-androideabi-clang++ examples/graph_lenet.cpp utils/Utils.cpp utils/GraphUtils.cpp -I. -Iinclude -std=c++11 -Wl,--whole-archive -larm_compute_graph-static -Wl,--no-whole-archive -larm_compute-static -larm_compute_core-static -L. -o graph_lenet_arm -static-libstdc++ -pie -lOpenCL -DARM_COMPUTE_CL
+arm-linux-androideabi-clang++ examples/graph_lenet.cpp utils/Utils.cpp utils/GraphUtils.cpp -I. -Iinclude -std=c++11 -Wl,--whole-archive -larm_compute_graph-static -Wl,--no-whole-archive -larm_compute-static -larm_compute_core-static -L. -o graph_lenet_arm -static-libstdc++ -pie -DARM_COMPUTE_CL
#64 bit:
-aarch64-linux-android-clang++ examples/graph_lenet.cpp utils/Utils.cpp utils/GraphUtils.cpp -I. -Iinclude -std=c++11 -Wl,--whole-archive -larm_compute_graph-static -Wl,--no-whole-archive -larm_compute-static -larm_compute_core-static -L. -o graph_lenet_aarch64 -static-libstdc++ -pie -lOpenCL -DARM_COMPUTE_CL
+aarch64-linux-android-clang++ examples/graph_lenet.cpp utils/Utils.cpp utils/GraphUtils.cpp -I. -Iinclude -std=c++11 -Wl,--whole-archive -larm_compute_graph-static -Wl,--no-whole-archive -larm_compute-static -larm_compute_core-static -L. -o graph_lenet_aarch64 -static-libstdc++ -pie -DARM_COMPUTE_CL
</pre><dl class="section note"><dt>Note</dt><dd>Due to some issues in older versions of the Mali OpenCL DDK (<= r13p0), we recommend to link <a class="el" href="namespacearm__compute.xhtml" title="This file contains all available output stages for GEMMLowp on OpenCL. ">arm_compute</a> statically on Android. </dd>
<dd>
When linked statically, the arm_compute_graph library currently needs the --whole-archive linker flag in order to work properly</dd></dl>
@@ -941,7 +1029,7 @@
<!-- start footer part -->
<div id="nav-path" class="navpath"><!-- id is needed for treeview function! -->
<ul>
- <li class="footer">Generated on Fri Mar 2 2018 12:38:01 for Compute Library by
+ <li class="footer">Generated on Wed May 23 2018 11:36:45 for Compute Library by
<a href="http://www.doxygen.org/index.html">
<img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.8.11 </li>
</ul>