arm_compute v17.06
diff --git a/arm_compute/runtime/CL/CLFunctions.h b/arm_compute/runtime/CL/CLFunctions.h
index d16354f..82929ba 100644
--- a/arm_compute/runtime/CL/CLFunctions.h
+++ b/arm_compute/runtime/CL/CLFunctions.h
@@ -30,6 +30,7 @@
 #include "arm_compute/runtime/CL/functions/CLActivationLayer.h"
 #include "arm_compute/runtime/CL/functions/CLArithmeticAddition.h"
 #include "arm_compute/runtime/CL/functions/CLArithmeticSubtraction.h"
+#include "arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h"
 #include "arm_compute/runtime/CL/functions/CLBitwiseAnd.h"
 #include "arm_compute/runtime/CL/functions/CLBitwiseNot.h"
 #include "arm_compute/runtime/CL/functions/CLBitwiseOr.h"
@@ -41,6 +42,7 @@
 #include "arm_compute/runtime/CL/functions/CLColorConvert.h"
 #include "arm_compute/runtime/CL/functions/CLConvolution.h"
 #include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"
+#include "arm_compute/runtime/CL/functions/CLDepthConcatenate.h"
 #include "arm_compute/runtime/CL/functions/CLDepthConvert.h"
 #include "arm_compute/runtime/CL/functions/CLDerivative.h"
 #include "arm_compute/runtime/CL/functions/CLDilate.h"
@@ -55,11 +57,16 @@
 #include "arm_compute/runtime/CL/functions/CLGaussian3x3.h"
 #include "arm_compute/runtime/CL/functions/CLGaussian5x5.h"
 #include "arm_compute/runtime/CL/functions/CLGaussianPyramid.h"
+#include "arm_compute/runtime/CL/functions/CLHOGDescriptor.h"
+#include "arm_compute/runtime/CL/functions/CLHOGDetector.h"
+#include "arm_compute/runtime/CL/functions/CLHOGGradient.h"
+#include "arm_compute/runtime/CL/functions/CLHOGMultiDetection.h"
 #include "arm_compute/runtime/CL/functions/CLHarrisCorners.h"
 #include "arm_compute/runtime/CL/functions/CLHistogram.h"
 #include "arm_compute/runtime/CL/functions/CLIntegralImage.h"
 #include "arm_compute/runtime/CL/functions/CLLaplacianPyramid.h"
 #include "arm_compute/runtime/CL/functions/CLLaplacianReconstruct.h"
+#include "arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h"
 #include "arm_compute/runtime/CL/functions/CLMagnitude.h"
 #include "arm_compute/runtime/CL/functions/CLMeanStdDev.h"
 #include "arm_compute/runtime/CL/functions/CLMedian3x3.h"
diff --git a/arm_compute/runtime/CL/CLHOG.h b/arm_compute/runtime/CL/CLHOG.h
new file mode 100644
index 0000000..9b4a303
--- /dev/null
+++ b/arm_compute/runtime/CL/CLHOG.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLHOG_H__
+#define __ARM_COMPUTE_CLHOG_H__
+
+#include "arm_compute/core/CL/ICLHOG.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/HOGInfo.h"
+#include "arm_compute/core/Types.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+/** OpenCL implementation of HOG data-object */
+class CLHOG : public ICLHOG
+{
+public:
+    /** Default constructor */
+    CLHOG();
+    /** Allocate the HOG descriptor using the given HOG's metadata
+     *
+     * @param[in] input HOG's metadata used to allocate the HOG descriptor
+     */
+    void init(const HOGInfo &input);
+
+    /** Enqueue a map operation of the allocated buffer.
+     *
+     * @param[in] blocking If true, then the mapping will be ready to use by the time
+     *                     this method returns, else it is the caller's responsibility
+     *                     to flush the queue and wait for the mapping operation to have completed.
+     */
+    void map(bool blocking = true);
+    using ICLHOG::map;
+
+    /** Enqueue an unmap operation of the allocated and mapped buffer.
+     *
+     * @note This method simply enqueues the unmap operation; it is the caller's responsibility to flush the queue and make sure the unmap is finished before
+     *       the memory is accessed by the device.
+     */
+    void unmap();
+    using ICLHOG::unmap;
+
+    // Inherited methods overridden:
+    void              free() override;
+    const HOGInfo    *info() const override;
+    const cl::Buffer &cl_buffer() const override;
+
+protected:
+    // Inherited methods overridden:
+    uint8_t *do_map(cl::CommandQueue &q, bool blocking) override;
+    void do_unmap(cl::CommandQueue &q) override;
+
+private:
+    HOGInfo    _info;
+    cl::Buffer _buffer;
+};
+}
+#endif /* __ARM_COMPUTE_CLHOG_H__ */
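Below is a minimal usage sketch of the new CLHOG object. The HOGInfo constructor argument order (cell size, block size, detection window size, block stride, number of bins) is an assumption taken from the library's HOG examples, the values are placeholders, and the wrapper function is purely illustrative:

    #include "arm_compute/runtime/CL/CLHOG.h"

    using namespace arm_compute;

    void hog_setup_example() // illustrative helper, not part of the library
    {
        CLHOG hog;
        // Assumed HOGInfo argument order; check core/HOGInfo.h for the exact signature
        hog.init(HOGInfo(Size2D(8, 8),    /* cell size */
                         Size2D(16, 16),  /* block size */
                         Size2D(64, 128), /* detection window size */
                         Size2D(8, 8),    /* block stride */
                         9));             /* number of histogram bins */

        hog.map();   // blocking map: the host-side descriptor is valid on return
        // ... read/write the linear SVM coefficients through the ICLHOG interface ...
        hog.unmap(); // enqueues the unmap; flush the queue before device access
    }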
diff --git a/arm_compute/runtime/CL/CLMultiHOG.h b/arm_compute/runtime/CL/CLMultiHOG.h
new file mode 100644
index 0000000..17bb4e0
--- /dev/null
+++ b/arm_compute/runtime/CL/CLMultiHOG.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLMULTIHOG_H__
+#define __ARM_COMPUTE_CLMULTIHOG_H__
+
+#include "arm_compute/core/CL/ICLMultiHOG.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLHOG.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+/** Basic implementation of the CL multi HOG data-objects */
+class CLMultiHOG : public ICLMultiHOG
+{
+public:
+    /** Constructor
+     *
+     * @param[in] num_models Number of HOG data objects to contain
+     *
+     */
+    CLMultiHOG(size_t num_models);
+
+    // Inherited methods overridden:
+    size_t  num_models() const override;
+    ICLHOG *cl_model(size_t index) override;
+    const ICLHOG *cl_model(size_t index) const override;
+
+private:
+    size_t                   _num_models;
+    std::unique_ptr<CLHOG[]> _model;
+};
+}
+#endif /*__ARM_COMPUTE_CLMULTIHOG_H__ */
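A short sketch of the container: each slot is a CLHOG that still has to be initialised with its own metadata. The downcast and the placeholder HOGInfo arguments are assumptions for illustration:

    #include "arm_compute/runtime/CL/CLMultiHOG.h"

    using namespace arm_compute;

    void multi_hog_example() // illustrative helper
    {
        CLMultiHOG multi_hog(2); // container for two HOG models
        for(size_t i = 0; i < multi_hog.num_models(); ++i)
        {
            // cl_model(i) returns the i-th model as an ICLHOG*; the concrete
            // objects are CLHOG instances owned by the container
            static_cast<CLHOG *>(multi_hog.cl_model(i))->init(HOGInfo(/* model-specific metadata */));
        }
    }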
diff --git a/arm_compute/runtime/CL/CLScheduler.h b/arm_compute/runtime/CL/CLScheduler.h
index 71baa55..8e80259 100644
--- a/arm_compute/runtime/CL/CLScheduler.h
+++ b/arm_compute/runtime/CL/CLScheduler.h
@@ -24,8 +24,12 @@
 #ifndef __ARM_COMPUTE_CLSCHEDULER_H__
 #define __ARM_COMPUTE_CLSCHEDULER_H__
 
+#include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLTypes.h"
 #include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Types.h"
 
 namespace arm_compute
 {
@@ -50,7 +54,7 @@
     void default_init()
     {
         CLKernelLibrary::get().init("./cl_kernels/", cl::Context::getDefault(), cl::Device::getDefault());
-        init(cl::Context::getDefault(), cl::CommandQueue::getDefault());
+        init(cl::Context::getDefault(), cl::CommandQueue::getDefault(), cl::Device::getDefault());
     }
     /** Schedule the execution of the passed kernel if possible.
      *
@@ -63,11 +67,14 @@
      *
      * @param[in] context A CL context.
      * @param[in] queue   A CL command queue.
+     * @param[in] device  A CL device.
      */
-    void init(cl::Context context = cl::Context::getDefault(), cl::CommandQueue queue = cl::CommandQueue::getDefault())
+    void init(cl::Context context = cl::Context::getDefault(), cl::CommandQueue queue = cl::CommandQueue::getDefault(),
+              cl::Device device = cl::Device::getDefault())
     {
         _context = std::move(context);
         _queue   = std::move(queue);
+        _target  = get_target_from_device(device);
     }
 
     /** Accessor for the associated CL context.
@@ -97,6 +104,15 @@
         return _queue;
     }
 
+    /** Get the target GPU.
+     *
+     * @return The target GPU.
+     */
+    GPUTarget target() const
+    {
+        return _target;
+    }
+
     /** Accessor to set the CL command queue to be used by the scheduler.
      *
      * @param[in] queue A CL command queue.
@@ -106,6 +122,15 @@
         _queue = std::move(queue);
     }
 
+    /** Accessor to set target GPU to be used by the scheduler.
+     *
+     * @param[in] target The target GPU.
+     */
+    void set_target(GPUTarget target)
+    {
+        _target = target;
+    }
+
     /** Blocks until all commands in the associated command queue have finished. */
     void sync()
     {
@@ -127,6 +152,7 @@
 private:
     cl::Context      _context;
     cl::CommandQueue _queue;
+    GPUTarget        _target;
 };
 }
 #endif /* __ARM_COMPUTE_CLSCHEDULER_H__ */
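The new target() accessor lets callers specialise behaviour on the detected GPU. A small sketch (the GPUTarget enumerator is an assumption; see core/CL/CLTypes.h for the actual values):

    #include "arm_compute/runtime/CL/CLScheduler.h"

    using namespace arm_compute;

    void scheduler_example() // illustrative helper
    {
        // Context, queue and device are taken from the OpenCL defaults; the
        // GPU target is now derived from the device automatically
        CLScheduler::get().default_init();

        if(CLScheduler::get().target() == GPUTarget::BIFROST)
        {
            // e.g. pick kernel configurations tuned for Bifrost GPUs
        }
    }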
diff --git a/arm_compute/runtime/CL/CLSubTensor.h b/arm_compute/runtime/CL/CLSubTensor.h
new file mode 100644
index 0000000..4bab164
--- /dev/null
+++ b/arm_compute/runtime/CL/CLSubTensor.h
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLSUBTENSOR_H__
+#define __ARM_COMPUTE_CLSUBTENSOR_H__
+
+#include "arm_compute/core/SubTensorInfo.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensorInfo;
+
+/** Basic implementation of the OpenCL sub-tensor interface */
+class CLSubTensor : public ICLTensor
+{
+public:
+    /** Constructor
+     *
+     * @param[in] parent       Parent tensor
+     * @param[in] tensor_shape Shape of the subtensor
+     * @param[in] coords       Coordinates of the first subtensor element inside the parent tensor.
+     */
+    CLSubTensor(ICLTensor *parent, const TensorShape &tensor_shape, const Coordinates &coords);
+    /** Destructor: free the tensor's memory */
+    ~CLSubTensor() = default;
+    /** Prevent instances of this class from being copy constructed */
+    CLSubTensor(const CLSubTensor &) = delete;
+    /** Prevent instances of this class from being copied */
+    CLSubTensor &operator=(const CLSubTensor &) = delete;
+    /** Allow instances of this class to be move constructed */
+    CLSubTensor(CLSubTensor &&) = default;
+    /** Allow instances of this class to be moved */
+    CLSubTensor &operator=(CLSubTensor &&) = default;
+
+    /** Enqueue a map operation of the allocated buffer.
+     *
+     * @note Mapping a subtensor will lead to the mapping of the whole parent tensor for now.
+     *
+     * @param[in] blocking If true, then the mapping will be ready to use by the time
+     *                     this method returns, else it is the caller's responsibility
+     *                     to flush the queue and wait for the mapping operation to have completed.
+     */
+    void map(bool blocking = true);
+    using ICLTensor::map;
+    /** Enqueue an unmap operation of the allocated and mapped buffer.
+     *
+     * @note Unmapping a subtensor will lead to the unmapping of the whole parent tensor for now.
+     *
+     * @note This method simply enqueues the unmap operation; it is the caller's responsibility to flush the queue and make sure the unmap is finished before
+     *       the memory is accessed by the device.
+     */
+    void unmap();
+    using ICLTensor::unmap;
+
+    /** Return the parent tensor of the subtensor
+     *
+     * @return Parent tensor
+     */
+    ICLTensor *parent();
+
+    // Inherited methods overridden:
+    ITensorInfo      *info() const override;
+    ITensorInfo      *info() override;
+    const cl::Buffer &cl_buffer() const override;
+
+protected:
+    // Inherited methods overridden:
+    uint8_t *do_map(cl::CommandQueue &q, bool blocking) override;
+    void do_unmap(cl::CommandQueue &q) override;
+
+private:
+    ICLTensor            *_parent;
+    mutable SubTensorInfo _info;
+};
+}
+#endif /*__ARM_COMPUTE_CLSUBTENSOR_H__ */
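A sketch of carving a sub-tensor out of a parent tensor (placeholder shapes; the helper name is illustrative):

    #include "arm_compute/runtime/CL/CLSubTensor.h"
    #include "arm_compute/runtime/CL/CLTensor.h"

    using namespace arm_compute;

    void sub_tensor_example() // illustrative helper
    {
        CLTensor parent;
        parent.allocator()->init(TensorInfo(TensorShape(32U, 32U, 8U), 1, DataType::F32));

        // View the third feature map of the parent as a [32, 32, 1] sub-tensor
        CLSubTensor slice(&parent, TensorShape(32U, 32U, 1U), Coordinates(0, 0, 2));

        parent.allocator()->allocate();
        // slice shares the parent's cl_buffer(); mapping the slice currently maps
        // the whole parent tensor (see the note on map() above)
    }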
diff --git a/arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h b/arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h
new file mode 100644
index 0000000..d766d1c
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLBATCHNORMALIZATIONLAYER_H__
+#define __ARM_COMPUTE_CLBATCHNORMALIZATIONLAYER_H__
+
+#include "arm_compute/runtime/IFunction.h"
+
+#include "arm_compute/core/CL/kernels/CLBatchNormalizationLayerKernel.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to run @ref CLBatchNormalizationLayerKernel and simulate a batch normalization layer.
+ *
+ * Batch normalization is calculated by:
+ * @f[ out_i = \gamma * (\frac{in_i - \mu_{B}}{\sqrt{\sigma^2_{B} + \epsilon}}) + \beta \equiv BN_{\gamma,\beta}(in_i) @f]
+ *
+ */
+class CLBatchNormalizationLayer : public IFunction
+{
+public:
+    /** Default constructor */
+    CLBatchNormalizationLayer();
+    /** Set the input and output tensors.
+     *
+     * @param[in]  input   Source tensor. 3 lower dimensions represent a single input with dimensions [width, height, FM].
+     *                     The rest are optional and used for representing batches. Data types supported: F32.
+     * @param[out] output  Destination tensor. Output will have the same number of dimensions as input. Data type supported: Same as @p input
+     * @param[in]  mean    Mean values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+     * @param[in]  var     Variance values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+     * @param[in]  beta    Beta values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+     * @param[in]  gamma   Gamma values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+     * @param[in]  epsilon Small value to avoid division by zero.
+     */
+    void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, const ICLTensor *gamma, float epsilon);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    CLBatchNormalizationLayerKernel _norm_kernel; /**< BatchNormalization layer kernel to run */
+};
+}
+#endif /* __ARM_COMPUTE_CLBATCHNORMALIZATIONLAYER_H__ */
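A configure/run sketch for the new function; the shapes, feature-map count and epsilon are placeholders and the wrapper is illustrative:

    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h"

    using namespace arm_compute;

    void batch_norm_example() // illustrative helper
    {
        constexpr unsigned int fm = 16; // number of feature maps
        CLTensor in, out, mean, var, beta, gamma;
        in.allocator()->init(TensorInfo(TensorShape(28U, 28U, fm), 1, DataType::F32));
        out.allocator()->init(TensorInfo(TensorShape(28U, 28U, fm), 1, DataType::F32));
        for(CLTensor *t : { &mean, &var, &beta, &gamma })
        {
            t->allocator()->init(TensorInfo(TensorShape(fm), 1, DataType::F32));
        }

        CLBatchNormalizationLayer bn;
        bn.configure(&in, &out, &mean, &var, &beta, &gamma, 0.001f);

        for(CLTensor *t : { &in, &out, &mean, &var, &beta, &gamma })
        {
            t->allocator()->allocate();
        }
        // ... fill the input and the statistics tensors ...
        bn.run();
        CLScheduler::get().sync();
    }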
diff --git a/arm_compute/runtime/CL/functions/CLConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLConvolutionLayer.h
index 2a9b487..6a40396 100644
--- a/arm_compute/runtime/CL/functions/CLConvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLConvolutionLayer.h
@@ -27,12 +27,12 @@
 #include "arm_compute/runtime/IFunction.h"
 
 #include "arm_compute/core/CL/kernels/CLCol2ImKernel.h"
-#include "arm_compute/core/CL/kernels/CLConvolutionLayerWeightsReshapeKernel.h"
 #include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
 #include "arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h"
 #include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h"
 #include "arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h"
 #include "arm_compute/core/CL/kernels/CLIm2ColKernel.h"
+#include "arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
@@ -42,6 +42,34 @@
 {
 class ICLTensor;
 
+/** Function to reshape and transpose the weights. This function calls the following kernels:
+ * -# @ref CLWeightsReshapeKernel
+ * -# @ref CLGEMMTranspose1xWKernel
+ */
+class CLConvolutionLayerReshapeWeights : public IFunction
+{
+public:
+    /** Constructor */
+    CLConvolutionLayerReshapeWeights();
+    /** Set the input and output tensors.
+     *
+     * @param[in]  weights      Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: F32.
+     * @param[in]  biases       Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p weights.
+     * @param[out] output       Destination tensor. Data types supported: Same as @p weights.
+     * @param[in]  transpose1xW True if the weights are to undergo a 1xW transposition after reshaping (in case of GEMM operation), false otherwise.
+     */
+    void configure(const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, bool transpose1xW);
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    CLConvolutionLayerWeightsReshapeKernel _weights_reshape_kernel;
+    CLGEMMTranspose1xWKernel               _weights_transposed_kernel;
+    CLTensor                               _weights_reshaped;
+    bool                                   _transpose1xW;
+};
+
 /** Basic function to compute the convolution layer. This function calls the following OpenCL kernels:
  *
  * -# @ref CLConvolutionLayerWeightsReshapeKernel (executed only once for each configuration)
@@ -58,35 +86,36 @@
     CLConvolutionLayer();
     /** Set the input and output tensors.
      *
-     * @param[in]  input     Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
-     *                       while every optional dimension from 4 and above represent a batch of inputs.
-     *                       Data types supported: F16, F32.
-     * @param[in]  weights   Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p input.
-     * @param[in]  biases    Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported:Same as @p input.
-     * @param[out] output    Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
-     *                       Data types supported: Same as @p input.
-     * @param[in]  conv_info Contains padding and stride information described in @ref PadStrideInfo.
+     * @param[in]  input        Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
+     *                          while every optional dimension from 4 and above represent a batch of inputs.
+     *                          Data types supported: F16, F32.
+     * @param[in]  weights      Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: Same as @p input.
+     * @param[in]  biases       Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
+     * @param[out] output       Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
+     *                          Data types supported: Same as @p input.
+     * @param[in]  conv_info    Contains padding and stride information described in @ref PadStrideInfo.
+     * @param[in]  weights_info Specifies if the weights tensor has been reshaped with @ref CLWeightsReshapeKernel. If this is not part of the fully connected layer the weights
+     *                          tensor has also been transposed with @ref CLGEMMTranspose1xWKernel.
      */
-    void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info);
+    void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info = WeightsInfo());
 
     // Inherited methods overridden:
     void run() override;
 
 private:
-    CLIm2ColKernel                         _input_im2col_kernel;
-    CLConvolutionLayerWeightsReshapeKernel _weights_reshape_kernel;
-    CLGEMMInterleave4x4Kernel              _input_interleave_kernel;
-    CLGEMMTranspose1xWKernel               _weights_transposed_kernel;
-    CLGEMMMatrixMultiplyKernel             _mm_kernel;
-    CLCol2ImKernel                         _output_col2im_kernel;
-    CLTensor                               _input_im2col_reshaped;
-    CLTensor                               _input_interleaved_reshaped;
-    CLTensor                               _weights_reshaped;
-    CLTensor                               _weights_transposed;
-    CLTensor                               _gemm_output;
-    bool                                   _is_first_run;
-    bool                                   _has_bias;
-    bool                                   _is_fc;
+    CLConvolutionLayerReshapeWeights _reshape_weights;
+    CLIm2ColKernel                   _input_im2col_kernel;
+    CLGEMMInterleave4x4Kernel        _input_interleave_kernel;
+    CLGEMMMatrixMultiplyKernel       _mm_kernel;
+    CLCol2ImKernel                   _output_col2im_kernel;
+    CLTensor                         _input_im2col_reshaped;
+    CLTensor                         _input_interleaved_reshaped;
+    CLTensor                         _weights_reshaped;
+    CLTensor                         _weights_transposed;
+    CLTensor                         _gemm_output;
+    bool                             _has_bias;
+    bool                             _is_fully_connected_convolution;
+    bool                             _are_weights_reshaped;
 };
 }
 #endif /* __ARM_COMPUTE_CLCONVOLUTIONLAYER_H__ */
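The new weights_info parameter lets callers hand in pre-reshaped weights; with the default WeightsInfo() the reshape still happens internally, once. A sketch of the default path (placeholder shapes and padding/stride values; the wrapper is illustrative):

    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"

    using namespace arm_compute;

    void convolution_example() // illustrative helper
    {
        CLTensor src, weights, biases, dst;
        // ... init src [W, H, IFM], weights [Kx, Ky, IFM, OFM], biases [OFM] and dst ...

        CLConvolutionLayer conv;
        // weights_info defaults to WeightsInfo(): the weights are reshaped (and
        // 1xW-transposed if needed) inside the function on the first run
        conv.configure(&src, &weights, &biases, &dst,
                       PadStrideInfo(1 /* stride_x */, 1 /* stride_y */, 1 /* pad_x */, 1 /* pad_y */));
        // ... allocate the tensors, fill them, then conv.run() ...
    }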
diff --git a/arm_compute/runtime/CL/functions/CLDepthConcatenate.h b/arm_compute/runtime/CL/functions/CLDepthConcatenate.h
new file mode 100644
index 0000000..3199936
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLDepthConcatenate.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLDEPTHCONCATENATE_H__
+#define __ARM_COMPUTE_CLDEPTHCONCATENATE_H__
+
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <memory>
+#include <vector>
+
+namespace arm_compute
+{
+class ICLTensor;
+class CLDepthConcatenateKernel;
+class CLFillBorderKernel;
+
+/** Basic function to concatenate tensors along the Z axis. This function calls the following kernels:
+ *
+ * -# @ref CLFillBorderKernel (executed if the input's two lowest dimensions are smaller than the corresponding output dimensions)
+ * -# @ref CLDepthConcatenateKernel
+ *
+ */
+class CLDepthConcatenate : public IFunction
+{
+public:
+    /** Default constructor */
+    CLDepthConcatenate();
+    /** Initialise the kernel's inputs vector and output.
+     *
+     * @param[in,out] inputs_vector The vector containing all the tensors to concatenate. Data types supported: F32.
+     * @param[out]    output        Output tensor. Data types supported: F32.
+     */
+    void configure(std::vector<ICLTensor *> inputs_vector, ICLTensor *output);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    std::vector<ICLTensor *>                    _inputs_vector;
+    std::unique_ptr<CLDepthConcatenateKernel[]> _concat_kernels_vector;
+    std::unique_ptr<CLFillBorderKernel[]>       _border_handlers_vector;
+    unsigned int                                _num_inputs;
+};
+}
+#endif /* __ARM_COMPUTE_CLDEPTHCONCATENATE_H__ */
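A sketch concatenating two F32 tensors along Z; the shapes are placeholders (the output depth is the sum of the input depths) and the wrapper is illustrative:

    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLDepthConcatenate.h"

    using namespace arm_compute;

    void depth_concat_example() // illustrative helper
    {
        CLTensor a, b, out;
        a.allocator()->init(TensorInfo(TensorShape(16U, 16U, 4U), 1, DataType::F32));
        b.allocator()->init(TensorInfo(TensorShape(16U, 16U, 6U), 1, DataType::F32));
        out.allocator()->init(TensorInfo(TensorShape(16U, 16U, 10U), 1, DataType::F32));

        CLDepthConcatenate concat;
        concat.configure({ &a, &b }, &out);
        // ... allocate the tensors, fill a and b, then concat.run() ...
    }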
diff --git a/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h b/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h
index 09e4fc9..826f445 100644
--- a/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h
+++ b/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h
@@ -36,13 +36,44 @@
 
 namespace arm_compute
 {
+/** Basic function to reshape the weights of Fully Connected layer with OpenCL. This function calls the following kernels:
+ *
+ *  -# @ref CLTransposeKernel        (if @p transpose_weights is set to true)
+ *  -# @ref CLGEMMTranspose1xWKernel (if @p is_batched_fc_layer is set to true)
+ *
+ * @note  The fully connected layer accepts "weights" tensors only with 2 dimensions.
+ */
+class CLFullyConnectedLayerReshapeWeights : public IFunction
+{
+public:
+    /** Constructor */
+    CLFullyConnectedLayerReshapeWeights();
+    /** Set the input and output tensors.
+     *
+     * @param[in]  input               Weights tensor. The weights must be 2 dimensional. Data types supported: QS8/F32.
+     * @param[out] output              Destination tensor. Data type supported: Same as @p input.
+     * @param[in]  transpose_weights   True if the weights must be transposed.
+     * @param[in]  is_batched_fc_layer True if it is a batched fully connected layer
+     */
+    void configure(const ICLTensor *input, ICLTensor *output, bool transpose_weights, bool is_batched_fc_layer);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    CLTransposeKernel        _transpose_kernel;
+    CLGEMMTranspose1xWKernel _transpose1xW_kernel;
+    CLTensor                 _transpose_output;
+    bool                     _transpose_weights;
+    bool                     _is_batched_fc_layer;
+};
+
 /** Basic function to compute a Fully Connected layer on OpenCL. This function calls the following OpenCL kernels:
  *
  *  -# @ref CLIm2ColKernel (called when the input comes from a convolutional layer)
- *  -# @ref CLTransposeKernel (if @p transpose_weights is set to true) (called once)
- *  -# @ref NEGEMMTranspose1xWKernel (called once if we have a multi-batch input)
- *  -# @ref NEGEMMInterleave4x4Kernel (called if we have a multi-batch input)
- *  -# @ref NEGEMMMatrixMultiplyKernel
+ *  -# @ref CLFullyConnectedLayerReshapeWeights (if @p are_weights_reshaped is set to false) (called once)
+ *  -# @ref CLGEMMInterleave4x4Kernel (called if we have a multi-batch input)
+ *  -# @ref CLGEMMMatrixMultiplyKernel
  *  -# @ref CLGEMMMatrixAccumulateBiasesKernel (if @p biases is not equal to nullptr)
  *
  * @note  The fully connected layer accepts "weights" tensors only with 2 dimensions.
@@ -54,13 +85,14 @@
     CLFullyConnectedLayer();
     /** Set the input and output tensors.
      *
-     * @param[in]  input             Source tensor. Data type supported: F16, F32.
-     * @param[in]  weights           Weights tensor. The weights must be 2 dimensional. Data type supported: Same as @p input
-     * @param[in]  biases            Bias tensor. It can be nullptr. Data type supported:Same as @p input.
-     * @param[out] output            Destination tensor. Data type supported: Same as @p input.
-     * @param[in]  transpose_weights (Optional) Transpose weights if true. Defaults to true.
+     * @param[in]  input                Source tensor. Data type supported: F16/F32.
+     * @param[in]  weights              Weights tensor. The weights must be 2 dimensional. Data type supported: Same as @p input
+     * @param[in]  biases               Bias tensor. It can be nullptr. Data type supported: Same as @p input.
+     * @param[out] output               Destination tensor. Data type supported: Same as @p input.
+     * @param[in]  transpose_weights    (Optional) Transpose weights if true. Defaults to true.
+     * @param[in]  are_weights_reshaped (Optional) Reshape the weights tensor if false. Defaults to false.
      */
-    void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, bool transpose_weights = true);
+    void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, bool transpose_weights = true, bool are_weights_reshaped = false);
 
     //Inherited methods override
     void run() override;
@@ -71,21 +103,18 @@
     void configure_conv_fc_wb(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output);
     void configure_conv_fc_nb(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output);
 
-    CLIm2ColKernel                     _im2col_kernel;
-    CLTransposeKernel                  _transpose_kernel;
-    CLGEMMTranspose1xWKernel           _transpose1xW_kernel;
-    CLGEMMInterleave4x4Kernel          _interleave4x4_kernel;
-    CLGEMMMatrixMultiplyKernel         _mm_kernel;
-    CLGEMMMatrixAccumulateBiasesKernel _accumulate_biases_kernel;
-    CLTensor                           _im2col_output;
-    CLTensor                           _interleave4x4_output;
-    CLTensor                           _transpose_output;
-    CLTensor                           _transpose1xW_output;
-    bool                               _is_first_run;
-    bool                               _transpose_weights;
-    bool                               _fc_after_conv;
-    bool                               _batched_fc_layer;
-    bool                               _accumulate_biases;
+    CLIm2ColKernel                      _im2col_kernel;
+    CLFullyConnectedLayerReshapeWeights _reshape_weights_kernel;
+    CLGEMMInterleave4x4Kernel           _interleave4x4_kernel;
+    CLGEMMMatrixMultiplyKernel          _mm_kernel;
+    CLGEMMMatrixAccumulateBiasesKernel  _accumulate_biases_kernel;
+    CLTensor                            _im2col_output;
+    CLTensor                            _interleave4x4_output;
+    CLTensor                            _reshape_weights_output;
+    bool                                _are_weights_reshaped;
+    bool                                _is_fc_after_conv;
+    bool                                _is_batched_fc_layer;
+    bool                                _accumulate_biases;
 };
 }
 #endif /* __ARM_COMPUTE_CLFULLYCONNECTEDLAYER_H__ */
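A sketch of the new reshaped-weights path: reshape once with CLFullyConnectedLayerReshapeWeights, then tell the function the weights are already reshaped. The flag combination and tensors are assumptions for illustration:

    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h"

    using namespace arm_compute;

    void fully_connected_example() // illustrative helper
    {
        CLTensor src, weights, reshaped_weights, biases, dst;
        // ... init the tensors; the weights tensor must be 2D ...

        CLFullyConnectedLayerReshapeWeights reshape;
        reshape.configure(&weights, &reshaped_weights,
                          true /* transpose_weights */, false /* is_batched_fc_layer */);

        CLFullyConnectedLayer fc;
        // are_weights_reshaped = true: skip the reshape inside the function
        fc.configure(&src, &reshaped_weights, &biases, &dst,
                     true /* transpose_weights */, true /* are_weights_reshaped */);
        // ... allocate, run the reshape once, then fc.run() per inference ...
    }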
diff --git a/arm_compute/runtime/CL/functions/CLHOGDescriptor.h b/arm_compute/runtime/CL/functions/CLHOGDescriptor.h
new file mode 100644
index 0000000..cdb23bf
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLHOGDescriptor.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLHOGDESCRIPTOR_H__
+#define __ARM_COMPUTE_CLHOGDESCRIPTOR_H__
+
+#include "arm_compute/core/CL/kernels/CLHOGDescriptorKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/functions/CLHOGGradient.h"
+#include "arm_compute/runtime/IFunction.h"
+
+namespace arm_compute
+{
+class IHOG;
+/** Basic function to calculate HOG descriptor. This function calls the following OpenCL kernels:
+ *
+ * -# @ref CLHOGGradient
+ * -# @ref CLHOGOrientationBinningKernel
+ * -# @ref CLHOGBlockNormalizationKernel
+ *
+ */
+class CLHOGDescriptor : public IFunction
+{
+public:
+    /** Default constructor */
+    CLHOGDescriptor();
+    /** Initialise the function's source, destination, HOG data-object and border mode
+     *
+     * @param[in, out] input                 Input tensor. Data type supported: U8
+     *                                       (Written to only for @p border_mode != UNDEFINED)
+     * @param[out]     output                Output tensor which stores the HOG descriptor. Data type supported: F32. The number of channels is equal to the number of histogram bins per block
+     * @param[in]      hog                   HOG data object which describes the HOG descriptor
+     * @param[in]      border_mode           Border mode to use.
+     * @param[in]      constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+     */
+    void configure(ICLTensor *input, ICLTensor *output, const IHOG *hog, BorderMode border_mode, uint8_t constant_border_value = 0);
+
+    // Inherited method overridden:
+    void run() override;
+
+private:
+    CLHOGGradient                 _gradient;
+    CLHOGOrientationBinningKernel _orient_bin;
+    CLHOGBlockNormalizationKernel _block_norm;
+    CLTensor                      _mag;
+    CLTensor                      _phase;
+    CLTensor                      _hog_space;
+};
+}
+
+#endif /* __ARM_COMPUTE_CLHOGDESCRIPTOR_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLHOGDetector.h b/arm_compute/runtime/CL/functions/CLHOGDetector.h
new file mode 100644
index 0000000..0b4fad7
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLHOGDetector.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLHOGDETECTOR_H__
+#define __ARM_COMPUTE_CLHOGDETECTOR_H__
+
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/CL/kernels/CLHOGDetectorKernel.h"
+#include "arm_compute/core/IHOG.h"
+#include "arm_compute/runtime/IFunction.h"
+
+namespace arm_compute
+{
+/** Basic function to execute HOG detector based on linear SVM. This function calls the following OpenCL kernel:
+ *
+ * -# @ref CLHOGDetectorKernel
+ *
+ */
+class CLHOGDetector : public IFunction
+{
+public:
+    /** Default constructor */
+    CLHOGDetector();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLHOGDetector(const CLHOGDetector &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLHOGDetector &operator=(const CLHOGDetector &) = delete;
+    /** Allow instances of this class to be moved */
+    CLHOGDetector(CLHOGDetector &&) = default;
+    /** Allow instances of this class to be moved */
+    CLHOGDetector &operator=(CLHOGDetector &&) = default;
+    /** Default destructor */
+    ~CLHOGDetector() = default;
+    /** Initialise the kernel's input, output, HOG data object, detection window stride, threshold and index class
+     *
+     * @attention The function does not reset the number of values in @ref IDetectionWindowArray so it is the caller's responsibility to clear it.
+     *
+     * @param[in]  input                   Input tensor. It is the output of @ref CLHOGDescriptor. Data type supported: F32
+     * @param[in]  hog                     HOG data-object that describes the HOG descriptor
+     * @param[out] detection_windows       Array of @ref DetectionWindow used to store the detected objects
+     * @param[in]  detection_window_stride Distance in pixels between 2 consecutive detection windows in x and y directions.
+     *                                     It must be a multiple of the block stride stored in @p hog
+     * @param[in]  threshold               (Optional) Threshold for the distance between features and SVM classifying plane
+     * @param[in]  idx_class               (Optional) Index of the class used for evaluating which class the detection window belongs to
+     */
+    void configure(const ICLTensor *input, const ICLHOG *hog, ICLDetectionWindowArray *detection_windows, const Size2D &detection_window_stride, float threshold = 0.0f, size_t idx_class = 0);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    CLHOGDetectorKernel      _hog_detector_kernel;
+    ICLDetectionWindowArray *_detection_windows;
+    cl::Buffer               _num_detection_windows;
+};
+}
+
+#endif /* __ARM_COMPUTE_CLHOGDETECTOR_H__ */
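A sketch chaining CLHOGDescriptor (which runs CLHOGGradient internally) and CLHOGDetector into a single-model pipeline. The HOGInfo values, the array capacity and the HOGInfo-based TensorInfo constructor are assumptions/placeholders, and the wrapper is illustrative:

    #include "arm_compute/runtime/CL/CLArray.h"
    #include "arm_compute/runtime/CL/CLHOG.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLHOGDescriptor.h"
    #include "arm_compute/runtime/CL/functions/CLHOGDetector.h"

    using namespace arm_compute;

    void hog_detection_example(CLImage &src) // illustrative helper; src is a U8 image
    {
        CLHOG hog;
        hog.init(HOGInfo(Size2D(8, 8), Size2D(16, 16), Size2D(64, 128), Size2D(8, 8), 9));

        // Assumed TensorInfo(HOGInfo, width, height) constructor for the HOG space
        CLTensor descriptor;
        descriptor.allocator()->init(TensorInfo(*hog.info(), src.info()->dimension(0), src.info()->dimension(1)));

        CLHOGDescriptor hog_descriptor;
        hog_descriptor.configure(&src, &descriptor, &hog, BorderMode::CONSTANT, 0);

        CLDetectionWindowArray detections(1000); // capacity for the detected windows
        CLHOGDetector detector;
        detector.configure(&descriptor, &hog, &detections, Size2D(8, 8) /* window stride */);

        // ... allocate the descriptor tensor, then:
        hog_descriptor.run();
        detector.run(); // note: the detection array is not reset by the function
    }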
diff --git a/arm_compute/runtime/CL/functions/CLHOGGradient.h b/arm_compute/runtime/CL/functions/CLHOGGradient.h
new file mode 100644
index 0000000..e74a684
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLHOGGradient.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLHOGGRADIENT_H__
+#define __ARM_COMPUTE_CLHOGGRADIENT_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+#include "arm_compute/core/CL/kernels/CLMagnitudePhaseKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/functions/CLDerivative.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+/** Basic function to calculate the gradient for HOG. This function calls the following OpenCL kernels:
+ *
+ * -# @ref CLDerivative
+ * -# @ref CLMagnitudePhaseKernel
+ *
+ */
+class CLHOGGradient : public IFunction
+{
+public:
+    /** Default constructor */
+    CLHOGGradient();
+    /** Initialise the function's source, destinations, phase type and border mode
+     *
+     * @param[in, out] input                 Input tensor. Data type supported: U8.
+     *                                       (Written to only for @p border_mode != UNDEFINED)
+     * @param[out]     output_magnitude      Output tensor (magnitude). Data type supported: U16.
+     * @param[out]     output_phase          Output tensor (phase). Data type supported: U8
+     * @param[in]      phase_type            Type of @ref PhaseType
+     * @param[in]      border_mode           Border mode to use
+     * @param[in]      constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+     */
+    void configure(ICLTensor *input, ICLTensor *output_magnitude, ICLTensor *output_phase, PhaseType phase_type, BorderMode border_mode, uint8_t constant_border_value = 0);
+
+    // Inherited method overridden:
+    void run() override;
+
+private:
+    CLDerivative           _derivative;
+    CLMagnitudePhaseKernel _mag_phase;
+    CLTensor               _gx;
+    CLTensor               _gy;
+};
+}
+#endif /*__ARM_COMPUTE_CLHOGGRADIENT_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLHOGMultiDetection.h b/arm_compute/runtime/CL/functions/CLHOGMultiDetection.h
new file mode 100644
index 0000000..3fe0fa9
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLHOGMultiDetection.h
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLHOGMULTIDETECTION_H__
+#define __ARM_COMPUTE_CLHOGMULTIDETECTION_H__
+
+#include "arm_compute/core/CL/ICLArray.h"
+#include "arm_compute/core/CL/ICLMultiHOG.h"
+#include "arm_compute/core/CL/kernels/CLHOGDescriptorKernel.h"
+#include "arm_compute/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/functions/CLHOGDetector.h"
+#include "arm_compute/runtime/CL/functions/CLHOGGradient.h"
+#include "arm_compute/runtime/IFunction.h"
+
+namespace arm_compute
+{
+/** Basic function to detect multiple objects (or the same object at different scales) on the same input image using HOG. This function calls the following kernels:
+ *
+ * -# @ref CLHOGGradient
+ * -# @ref CLHOGOrientationBinningKernel
+ * -# @ref CLHOGBlockNormalizationKernel
+ * -# @ref CLHOGDetector
+ * -# @ref CPPDetectionWindowNonMaximaSuppressionKernel (executed if non_maxima_suppression == true)
+ *
+ * @note This implementation works if all the HOG data-objects within the IMultiHOG container have the same:
+ *       -# Phase type
+ *       -# Normalization type
+ *       -# L2 hysteresis threshold if the normalization type is L2HYS_NORM
+ *
+ */
+class CLHOGMultiDetection : public IFunction
+{
+public:
+    /** Default constructor */
+    CLHOGMultiDetection();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLHOGMultiDetection(const CLHOGMultiDetection &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLHOGMultiDetection &operator=(const CLHOGMultiDetection &) = delete;
+    /** Initialise the function's source, destination, detection window strides, border mode, threshold and non-maxima suppression
+     *
+     * @param[in, out] input                    Input tensor. Data type supported: U8
+     *                                          (Written to only for @p border_mode != UNDEFINED)
+     * @param[in]      multi_hog                Container of multiple HOG data object. Each HOG data object describes one HOG model to detect.
+     *                                          This container should store the HOG data-objects in ascending or descending order of cell-size width.
+     *                                          This helps the function determine whether the HOG descriptor computation can be skipped for some HOG data-objects
+     * @param[out]     detection_windows        Array of @ref DetectionWindow used for locating the detected objects
+     * @param[in]      detection_window_strides Array of @ref Size2D used to specify the distance in pixels between 2 consecutive detection windows in x and y directions for each HOG data-object
+     *                                          The dimension of this array must be the same as multi_hog->num_models()
+     *                                          The i-th detection_window_stride of this array must be a multiple of the block_stride stored in the i-th multi_hog array
+     * @param[in]      border_mode              Border mode to use.
+     * @param[in]      constant_border_value    (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+     * @param[in]      threshold                (Optional) Threshold for the distance between features and SVM classifying plane
+     * @param[in]      non_maxima_suppression   (Optional) Flag to specify whether the non-maxima suppression is required or not.
+     *                                          True if the non-maxima suppression stage has to be computed
+     * @param[in]      min_distance             (Optional) Radial Euclidean distance to use for the non-maxima suppression stage
+     *
+     */
+    void configure(ICLTensor *input, const ICLMultiHOG *multi_hog, ICLDetectionWindowArray *detection_windows, ICLSize2DArray *detection_window_strides, BorderMode border_mode,
+                   uint8_t constant_border_value = 0,
+                   float threshold = 0.0f, bool non_maxima_suppression = false, float min_distance = 1.0f);
+
+    // Inherited method overridden:
+    void run() override;
+
+private:
+    CLHOGGradient                                                 _gradient_kernel;
+    std::unique_ptr<CLHOGOrientationBinningKernel[]>              _orient_bin_kernel;
+    std::unique_ptr<CLHOGBlockNormalizationKernel[]>              _block_norm_kernel;
+    std::unique_ptr<CLHOGDetector[]>                              _hog_detect_kernel;
+    std::unique_ptr<CPPDetectionWindowNonMaximaSuppressionKernel> _non_maxima_kernel;
+    std::unique_ptr<CLTensor[]>                                   _hog_space;
+    std::unique_ptr<CLTensor[]>                                   _hog_norm_space;
+    ICLDetectionWindowArray                                      *_detection_windows;
+    CLTensor                                                      _mag;
+    CLTensor                                                      _phase;
+    bool                                                          _non_maxima_suppression;
+    size_t                                                        _num_orient_bin_kernel;
+    size_t                                                        _num_block_norm_kernel;
+    size_t                                                        _num_hog_detect_kernel;
+};
+}
+
+#endif /* __ARM_COMPUTE_CLHOGMULTIDETECTION_H__ */
\ No newline at end of file
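A multi-model sketch; all values are placeholders, both models are assumed to share phase/normalization settings as the note above requires, and the map/push_back handling of the CL arrays is an assumption based on the other CL array usages:

    #include "arm_compute/runtime/CL/CLArray.h"
    #include "arm_compute/runtime/CL/CLMultiHOG.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLHOGMultiDetection.h"

    using namespace arm_compute;

    void multi_detection_example(CLImage &src) // illustrative helper; src is a U8 image
    {
        CLMultiHOG multi_hog(2);
        // ... init multi_hog.cl_model(0) and multi_hog.cl_model(1) ...

        CLDetectionWindowArray detections(2000);
        CLSize2DArray          strides(2);
        strides.map(); // CL arrays must be mapped before host-side push_back
        strides.push_back(Size2D(8, 8));   // stride for model 0
        strides.push_back(Size2D(16, 16)); // stride for model 1
        strides.unmap();

        CLHOGMultiDetection multi_detection;
        multi_detection.configure(&src, &multi_hog, &detections, &strides,
                                  BorderMode::CONSTANT, 0, 1.0f /* threshold */,
                                  true /* non_maxima_suppression */, 3.0f /* min_distance */);
        multi_detection.run();
    }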
diff --git a/arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h b/arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h
new file mode 100644
index 0000000..b4e4691
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLLOCALLYCONNECTEDLAYER_H__
+#define __ARM_COMPUTE_CLLOCALLYCONNECTEDLAYER_H__
+
+#include "arm_compute/runtime/IFunction.h"
+
+#include "arm_compute/core/CL/kernels/CLCol2ImKernel.h"
+#include "arm_compute/core/CL/kernels/CLIm2ColKernel.h"
+#include "arm_compute/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.h"
+#include "arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to compute the locally connected layer. This function calls the following OpenCL kernels:
+ *
+ * -# @ref CLLocallyConnectedLayerWeightsReshapeKernel (executed only once for each configuration)
+ * -# @ref CLIm2ColKernel
+ * -# @ref CLLocallyConnectedMatrixMultiplyKernel
+ * -# @ref CLCol2ImKernel
+ */
+class CLLocallyConnectedLayer : public IFunction
+{
+public:
+    /** Default constructor */
+    CLLocallyConnectedLayer();
+    /** Set the input and output tensors.
+     *
+     * @param[in]  input     Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
+     *                       while every optional dimension from 4 and above represent a batch of inputs.
+     *                       Data types supported: F32.
+     * @param[in]  weights   Weights tensor. Weights are 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches]. Data type supported: Same as @p input.
+     * @param[in]  biases    Biases tensor. Shared biases supported. Biases are 2D tensor with dimensions [OFM, num_patches]. Data type supported: Same as @p input.
+     * @param[out] output    Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
+     *                       Data types supported: Same as @p input.
+     * @param[in]  conv_info Contains padding and stride information described in @ref PadStrideInfo.
+     */
+    void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    CLIm2ColKernel                              _input_im2col_kernel;
+    CLLocallyConnectedLayerWeightsReshapeKernel _weights_reshape_kernel;
+    CLLocallyConnectedMatrixMultiplyKernel      _mm_kernel;
+    CLCol2ImKernel                              _output_col2im_kernel;
+    CLTensor                                    _input_im2col_reshaped;
+    CLTensor                                    _weights_reshaped;
+    CLTensor                                    _gemm_output;
+    bool                                        _is_first_run;
+};
+}
+#endif /* __ARM_COMPUTE_CLLOCALLYCONNECTEDLAYER_H__ */
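A configure/run sketch; note the 5D weights layout (placeholder shapes, illustrative wrapper):

    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h"

    using namespace arm_compute;

    void locally_connected_example() // illustrative helper
    {
        CLTensor src, weights, biases, dst;
        // src: [W, H, IFM]; weights: [Kx, Ky, IFM, OFM, num_patches];
        // biases: [OFM, num_patches]; dst: [W', H', OFM]
        // ... init the tensor infos ...

        CLLocallyConnectedLayer lc;
        lc.configure(&src, &weights, &biases, &dst, PadStrideInfo(1, 1, 0, 0));
        // ... allocate, fill, then lc.run() ...
    }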
diff --git a/arm_compute/runtime/CPP/CPPScheduler.h b/arm_compute/runtime/CPP/CPPScheduler.h
index 0828af6..7a37e5e 100644
--- a/arm_compute/runtime/CPP/CPPScheduler.h
+++ b/arm_compute/runtime/CPP/CPPScheduler.h
@@ -24,35 +24,28 @@
 #ifndef __ARM_COMPUTE_CPPSCHEDULER_H__
 #define __ARM_COMPUTE_CPPSCHEDULER_H__
 
-#include <cstddef>
+#include "arm_compute/runtime/IScheduler.h"
+
 #include <memory>
 
 namespace arm_compute
 {
-class ICPPKernel;
 class Thread;
 
-/** Pool of threads to automatically split a kernel's execution among several threads. */
-class CPPScheduler
+/** C++11 implementation of a pool of threads to automatically split a kernel's execution among several threads. */
+class CPPScheduler : public IScheduler
 {
-private:
-    /** Constructor: create a pool of threads. */
-    CPPScheduler();
-
 public:
-    /** Force the re-creation of the pool of threads to use the specified number of threads.
+    /** Sets the number of threads the scheduler will use to run the kernels.
      *
-     * @param[in] num_threads If set to 0, then std::thread::hardware_concurrency() threads will be used, otherwise the number of threads specified.
+     * @param[in] num_threads If set to 0, then the maximum number of threads supported by C++11 will be used, otherwise the number of threads specified.
      */
-    void force_number_of_threads(int num_threads);
+    void set_num_threads(unsigned int num_threads) override;
     /** Returns the number of threads that the CPPScheduler has in its pool.
      *
      * @return Number of threads available in CPPScheduler.
      */
-    int num_threads() const
-    {
-        return _num_threads;
-    }
+    unsigned int num_threads() const override;
     /** Access the scheduler singleton
      *
      * @return The scheduler
@@ -65,12 +58,15 @@
      * - The scheduler has been initialized with only one thread.
      *
      * @param[in] kernel          Kernel to execute.
-     * @param[in] split_dimension Dimension along which to split the kernel's execution window (By default 1/Y)
+     * @param[in] split_dimension Dimension along which to split the kernel's execution window.
      */
-    void multithread(ICPPKernel *kernel, size_t split_dimension = 1);
+    void schedule(ICPPKernel *kernel, unsigned int split_dimension) override;
 
 private:
-    int _num_threads;
+    /** Constructor: create a pool of threads. */
+    CPPScheduler();
+
+    unsigned int _num_threads;
     std::unique_ptr<Thread[], void (*)(Thread *)> _threads;
 };
 }
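With the singleton accessor and the renamed set_num_threads(), thread control is a one-liner:

    // Use 4 worker threads for all subsequently scheduled CPP/NEON kernels
    CPPScheduler::get().set_num_threads(4);
    // Passing 0 restores the default (one thread per core reported by C++11)
    CPPScheduler::get().set_num_threads(0);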
diff --git a/arm_compute/runtime/IFunction.h b/arm_compute/runtime/IFunction.h
index 0cd21b9..a4e7ed1 100644
--- a/arm_compute/runtime/IFunction.h
+++ b/arm_compute/runtime/IFunction.h
@@ -36,7 +36,7 @@
      * - Multi-threading is used for the kernels which are parallelisable.
      * - By default std::thread::hardware_concurrency() threads are used.
      *
-     * @note @ref CPPScheduler::force_number_of_threads() can be used to manually set the number of threads
+     * @note @ref CPPScheduler::set_num_threads() can be used to manually set the number of threads
      *
      * For OpenCL kernels:
      * - All the kernels are enqueued on the queue associated with CLScheduler.
diff --git a/arm_compute/runtime/IScheduler.h b/arm_compute/runtime/IScheduler.h
new file mode 100644
index 0000000..39c027c
--- /dev/null
+++ b/arm_compute/runtime/IScheduler.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_ISCHEDULER_H__
+#define __ARM_COMPUTE_ISCHEDULER_H__
+
+namespace arm_compute
+{
+class ICPPKernel;
+
+/** Scheduler interface to run kernels */
+class IScheduler
+{
+public:
+    /** Destructor. */
+    virtual ~IScheduler() = default;
+    /** Sets the number of threads the scheduler will use to run the kernels.
+     *
+     * @param[in] num_threads If set to 0, then one thread per CPU core available on the system will be used, otherwise the number of threads specified.
+     */
+    virtual void set_num_threads(unsigned int num_threads) = 0;
+    /** Returns the number of threads that the scheduler has in its pool.
+     *
+     * @return Number of threads available in the scheduler.
+     */
+    virtual unsigned int num_threads() const = 0;
+    /** Schedules the execution of the passed kernel, splitting its execution window along the given dimension when the implementation supports it.
+     *
+     * @param[in] kernel          Kernel to execute.
+     * @param[in] split_dimension Dimension along which to split the kernel's execution window.
+     */
+    virtual void schedule(ICPPKernel *kernel, unsigned int split_dimension) = 0;
+};
+}
+#endif /* __ARM_COMPUTE_ISCHEDULER_H__ */
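The new interface also allows custom schedulers to be plugged in. A minimal single-threaded sketch, assuming the v17.06 ICPPKernel::run(const Window &) signature:

    #include "arm_compute/core/CPP/ICPPKernel.h"
    #include "arm_compute/runtime/IScheduler.h"

    class MySingleThreadScheduler : public arm_compute::IScheduler
    {
    public:
        void set_num_threads(unsigned int num_threads) override
        {
            // Single-threaded: the requested thread count is ignored
        }
        unsigned int num_threads() const override
        {
            return 1;
        }
        void schedule(arm_compute::ICPPKernel *kernel, unsigned int split_dimension) override
        {
            // No splitting: execute the kernel over its full execution window
            kernel->run(kernel->window());
        }
    };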
diff --git a/arm_compute/runtime/MultiHOG.h b/arm_compute/runtime/MultiHOG.h
index 486ae14..32bad70 100644
--- a/arm_compute/runtime/MultiHOG.h
+++ b/arm_compute/runtime/MultiHOG.h
@@ -29,6 +29,8 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/HOG.h"
 
+#include <memory>
+
 namespace arm_compute
 {
 /** CPU implementation of multi HOG data-object */
diff --git a/arm_compute/runtime/NEON/NEFunctions.h b/arm_compute/runtime/NEON/NEFunctions.h
index ef17599..daf76f3 100644
--- a/arm_compute/runtime/NEON/NEFunctions.h
+++ b/arm_compute/runtime/NEON/NEFunctions.h
@@ -30,6 +30,7 @@
 #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
 #include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h"
 #include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h"
+#include "arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h"
 #include "arm_compute/runtime/NEON/functions/NEBitwiseAnd.h"
 #include "arm_compute/runtime/NEON/functions/NEBitwiseNot.h"
 #include "arm_compute/runtime/NEON/functions/NEBitwiseOr.h"
@@ -41,9 +42,11 @@
 #include "arm_compute/runtime/NEON/functions/NEColorConvert.h"
 #include "arm_compute/runtime/NEON/functions/NEConvolution.h"
 #include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEDepthConcatenate.h"
 #include "arm_compute/runtime/NEON/functions/NEDepthConvert.h"
 #include "arm_compute/runtime/NEON/functions/NEDerivative.h"
 #include "arm_compute/runtime/NEON/functions/NEDilate.h"
+#include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h"
 #include "arm_compute/runtime/NEON/functions/NEEqualizeHistogram.h"
 #include "arm_compute/runtime/NEON/functions/NEErode.h"
 #include "arm_compute/runtime/NEON/functions/NEFastCorners.h"
@@ -65,6 +68,7 @@
 #include "arm_compute/runtime/NEON/functions/NEIntegralImage.h"
 #include "arm_compute/runtime/NEON/functions/NELaplacianPyramid.h"
 #include "arm_compute/runtime/NEON/functions/NELaplacianReconstruct.h"
+#include "arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h"
 #include "arm_compute/runtime/NEON/functions/NEMagnitude.h"
 #include "arm_compute/runtime/NEON/functions/NEMeanStdDev.h"
 #include "arm_compute/runtime/NEON/functions/NEMedian3x3.h"
diff --git a/arm_compute/runtime/NEON/NEScheduler.h b/arm_compute/runtime/NEON/NEScheduler.h
index c65d6b7..94c82b2 100644
--- a/arm_compute/runtime/NEON/NEScheduler.h
+++ b/arm_compute/runtime/NEON/NEScheduler.h
@@ -24,10 +24,10 @@
 #ifndef __ARM_COMPUTE_NESCHEDULER_H__
 #define __ARM_COMPUTE_NESCHEDULER_H__
 
-#include "arm_compute/runtime/CPP/CPPScheduler.h"
+#include "arm_compute/runtime/Scheduler.h"
 
 namespace arm_compute
 {
-using NEScheduler = CPPScheduler;
+using NEScheduler = Scheduler;
 }
 #endif /*__ARM_COMPUTE_NESCHEDULER_H__ */
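Since NEScheduler is now an alias of the generic Scheduler front-end, existing call sites keep compiling; assuming Scheduler::get() returns the active IScheduler (as the alias implies), thread control still reads:

    // Talks to whichever backend is active (C++11 pool, OpenMP, single-threaded)
    NEScheduler::get().set_num_threads(2);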
diff --git a/arm_compute/runtime/NEON/functions/NEActivationLayer.h b/arm_compute/runtime/NEON/functions/NEActivationLayer.h
index 3fb3e20..35366e1 100644
--- a/arm_compute/runtime/NEON/functions/NEActivationLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEActivationLayer.h
@@ -41,7 +41,7 @@
 public:
     /** Set the input and output tensor.
      *
-     * @param[in]  input           Source tensor. Data type supported: F32.
+     * @param[in]  input           Source tensor. Data type supported: QS8/F32.
      * @param[out] output          Destination tensor. Data type supported: same as @p input
      * @param[in]  activation_info Activation layer parameters.
      */
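The newly supported QS8 path needs a fixed point position when initialising the tensors; a sketch, assuming the TensorInfo overload that takes a fixed point position:

    Tensor src, dst;
    // QS8 with 3 fractional bits (fixed point position = 3)
    src.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::QS8, 3));
    dst.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::QS8, 3));

    NEActivationLayer act;
    act.configure(&src, &dst, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));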
diff --git a/arm_compute/runtime/NEON/functions/NEArithmeticAddition.h b/arm_compute/runtime/NEON/functions/NEArithmeticAddition.h
index 8f66a6d..8e34e98 100644
--- a/arm_compute/runtime/NEON/functions/NEArithmeticAddition.h
+++ b/arm_compute/runtime/NEON/functions/NEArithmeticAddition.h
@@ -35,11 +35,11 @@
 class NEArithmeticAddition : public INESimpleFunction
 {
 public:
-    /** Initialise the kernel's inputs, output and convertion policy.
+    /** Initialise the kernel's inputs, output and conversion policy.
      *
-     * @param[in]  input1 First tensor input. Data types supported: U8 or S16.
-     * @param[in]  input2 Second tensor input. Data types supported: U8 or S16.
-     * @param[out] output Output tensor. Data types supported: U8 or S16.
+     * @param[in]  input1 First tensor input. Data types supported: U8/S16.
+     * @param[in]  input2 Second tensor input. Data types supported: U8/S16.
+     * @param[out] output Output tensor. Data types supported: U8/S16.
      * @param[in]  policy Policy to use to handle overflow.
      */
     void configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy);
diff --git a/arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h b/arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h
index d0eaff7..841b591 100644
--- a/arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h
+++ b/arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h
@@ -35,11 +35,11 @@
 class NEArithmeticSubtraction : public INESimpleFunction
 {
 public:
-    /** Initialise the kernel's inputs, output and convertion policy.
+    /** Initialise the kernel's inputs, output and conversion policy.
      *
-     * @param[in]  input1 First tensor input. Data types supported: U8 or S16.
-     * @param[in]  input2 Second tensor input. Data types supported: U8 or S16.
-     * @param[out] output Output tensor. Data types supported: U8 or S16.
+     * @param[in]  input1 First tensor input. Data types supported: U8/S16.
+     * @param[in]  input2 Second tensor input. Data types supported: U8/S16.
+     * @param[out] output Output tensor. Data types supported: U8/S16.
      * @param[in]  policy Policy to use to handle overflow.
      */
     void configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy);
diff --git a/arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h b/arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h
new file mode 100644
index 0000000..b0b5c12
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEBATCHNORMALIZATIONLAYER_H__
+#define __ARM_COMPUTE_NEBATCHNORMALIZATIONLAYER_H__
+
+#include "arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to run @ref NEBatchNormalizationLayerKernel and simulate a batch normalization layer.
+ *
+ * Batch normalization is calculated by:
+ * @f[ out_i = \gamma * (\frac{in_i - \mu_{B}}{\sqrt{\sigma^2_{B} + \epsilon}}) + \beta \equiv BN_{\gamma,\beta}(in_i) @f]
+ *
+ */
+class NEBatchNormalizationLayer : public IFunction
+{
+public:
+    /** Default constructor */
+    NEBatchNormalizationLayer();
+    /** Set the input and output tensors.
+     *
+     * @param[in]  input   Source tensor. 3 lower dimensions represent a single input with dimensions [width, height, FM].
+     *                     The rest are optional and used for representing batches. Data types supported: QS8/F32.
+     * @param[out] output  Destination tensor. Output will have the same number of dimensions as input. Data type supported: same as @p input
+     * @param[in]  mean    Mean values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+     * @param[in]  var     Variance values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+     * @param[in]  beta    Beta values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+     * @param[in]  gamma   Gamma values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+     * @param[in]  epsilon Small value to avoid division by zero.
+     */
+    void configure(const ITensor *input, ITensor *output, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    NEBatchNormalizationLayerKernel _norm_kernel; /**< Batch normalization layer kernel */
+};
+}
+#endif /* __ARM_COMPUTE_NEBATCHNORMALIZATIONLAYER_H__ */
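A usage sketch matching the configure() signature above (illustrative shapes; the per-channel statistics are 1D tensors of size [FM]):

    Tensor src, dst, mean, var, beta, gamma;
    constexpr unsigned int fm = 32; // number of feature maps
    src.allocator()->init(TensorInfo(TensorShape(28U, 28U, fm), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(28U, 28U, fm), 1, DataType::F32));
    for(Tensor *t : { &mean, &var, &beta, &gamma })
    {
        t->allocator()->init(TensorInfo(TensorShape(fm), 1, DataType::F32));
    }

    NEBatchNormalizationLayer bn;
    bn.configure(&src, &dst, &mean, &var, &beta, &gamma, 1e-5f); // epsilon = 1e-5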
diff --git a/arm_compute/runtime/NEON/functions/NEConvolution.h b/arm_compute/runtime/NEON/functions/NEConvolution.h
index 5c80977..1704d9f 100644
--- a/arm_compute/runtime/NEON/functions/NEConvolution.h
+++ b/arm_compute/runtime/NEON/functions/NEConvolution.h
@@ -49,7 +49,7 @@
     /** Initialize the function's source, destination, conv and border_mode.
      *
      * @param[in,out] input                 Source tensor. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED)
-     * @param[out]    output                Destination tensor, Data types supported: U8 or S16.
+     * @param[out]    output                Destination tensor, Data types supported: U8/S16.
      * @param[in]     conv                  Matrix_size x matrix_size S16 coefficients structured as a row-major 2D array in a linear buffer.
      * @param[in]     scale                 Scale of the convolution matrix. If 0 is passed, it will be set to the sum of the coefficients of the convolution or 1 if they add up to 0.
      * @param[in]     border_mode           Strategy to use for borders.
diff --git a/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h
index a6862ca..a8fff8d 100644
--- a/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h
@@ -27,12 +27,12 @@
 #include "arm_compute/runtime/IFunction.h"
 
 #include "arm_compute/core/NEON/kernels/NECol2ImKernel.h"
-#include "arm_compute/core/NEON/kernels/NEConvolutionLayerWeightsReshapeKernel.h"
 #include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
 #include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
 #include "arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h"
 #include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
 #include "arm_compute/core/NEON/kernels/NEIm2ColKernel.h"
+#include "arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/Tensor.h"
 
@@ -40,11 +40,38 @@
 {
 class ITensor;
 
-/** Basic function to simulate a convolution layer. This function calls the following OpenCL kernels:
- * -# @ref NEConvolutionLayerWeightsReshapeKernel (executed only once for each configuration)
- * -# @ref NEGEMMTranspose1xWKernel               (executed only once for each configuration)
+/** Function to reshape and perform 1xW transposition on the weights. This function calls the following kernels:
+ * -# @ref NEWeightsReshapeKernel
+ * -# @ref NEGEMMTranspose1xWKernel (executed in case GEMM is required for the operation)
+ */
+class NEConvolutionLayerReshapeWeights : public IFunction
+{
+public:
+    /** Constructor */
+    NEConvolutionLayerReshapeWeights();
+    /** Set the input and output tensors.
+     *
+     * @param[in]  weights      Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: QS8/F32.
+     * @param[in]  biases       Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p weights.
+     * @param[out] output       Destination tensor. Data types supported: Same as @p weights.
+     * @param[in]  weights      Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: QS8/F32.
+     * @param[in]  transpose1xW True if the weights are to undergo a 1xW transposition after reshaping (in case of GEMM operation), false otherwise.
+     */
+    void configure(const ITensor *weights, const ITensor *biases, ITensor *output, bool transpose1xW);
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    NEWeightsReshapeKernel   _weights_reshape_kernel;
+    NEGEMMTranspose1xWKernel _weights_transposed_kernel;
+    Tensor                   _weights_reshaped;
+    bool                     _transpose1xW;
+};
+
+/** Basic function to simulate a convolution layer. This function calls the following NEON kernels:
+ * -# @ref NEWeightsReshapeKernel   (executed only once for each configuration)
  * -# @ref NEIm2ColKernel
- * -# @ref NEGEMMInterleave4x4Kernel
+ * -# @ref NEGEMMInterleave4x4Kernel (executed only in case GEMM is required for the operation)
  * -# @ref NEGEMMMatrixMultiplyKernel
  * -# @ref NECol2ImKernel
  */
@@ -55,34 +82,34 @@
     NEConvolutionLayer();
     /** Set the input and output tensors.
      *
-     * @param[in]  input     Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
-     *                       while every optional dimension from 4 and above represent a batch of inputs.
-     *                       Data types supported: F32.
-     * @param[in]  weights   Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: Same as @p input.
-     * @param[in]  biases    Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[out] output    Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
-     *                       Data types supported: Same as @p input.
-     * @param[in]  conv_info Contains padding and stride information described in @ref PadStrideInfo.
+     * @param[in]  input        Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
+     *                          while every optional dimension from 4 and above represents a batch of inputs.
+     *                          Data types supported: QS8/F32.
+     * @param[in]  weights      Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: Same as @p input.
+     * @param[in]  biases       Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
+     * @param[out] output       Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent a batch of outputs.
+     *                          Data types supported: Same as @p input.
+     * @param[in]  conv_info    Contains padding and stride information described in @ref PadStrideInfo.
+     * @param[in]  weights_info Specifies if the weights tensor has been reshaped with NEWeightsReshapeKernel and, when this is not part of a fully connected layer,
+     *                          also transposed with NEGEMMTranspose1xWKernel.
      */
-    void configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info);
-
+    void configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info = WeightsInfo());
     // Inherited methods overridden:
     void run() override;
 
 private:
-    NEIm2ColKernel                         _input_im2col_kernel;
-    NEGEMMInterleave4x4Kernel              _input_interleave_kernel;
-    NEConvolutionLayerWeightsReshapeKernel _weights_reshape_kernel;
-    NEGEMMTranspose1xWKernel               _weights_transposed_kernel;
-    NEGEMMMatrixMultiplyKernel             _mm_kernel;
-    NECol2ImKernel                         _output_col2im_kernel;
-    Tensor                                 _input_im2col_reshaped;
-    Tensor                                 _input_interleaved_reshaped;
-    Tensor                                 _weights_reshaped;
-    Tensor                                 _weights_transposed;
-    Tensor                                 _gemm_output;
-    bool                                   _is_first_run;
-    bool                                   _has_bias;
+    NEIm2ColKernel                   _input_im2col_kernel;
+    NEGEMMInterleave4x4Kernel        _input_interleave_kernel;
+    NEConvolutionLayerReshapeWeights _reshape_weights;
+    NEGEMMMatrixMultiplyKernel       _mm_kernel;
+    NECol2ImKernel                   _output_col2im_kernel;
+    Tensor                           _input_im2col_reshaped;
+    Tensor                           _input_interleaved_reshaped;
+    Tensor                           _weights_reshaped;
+    Tensor                           _gemm_output;
+    bool                             _has_bias;
+    bool                             _is_fully_connected_convolution;
+    bool                             _are_weights_reshaped;
 };
 }
 #endif /* __ARM_COMPUTE_NECONVOLUTIONLAYER_H__ */
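With the default WeightsInfo() the layer reshapes (and, on the GEMM path, 1xW-transposes) the weights itself on first run; a sketch with illustrative shapes:

    Tensor src, weights, biases, dst;
    src.allocator()->init(TensorInfo(TensorShape(32U, 32U, 16U), 1, DataType::F32));
    weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 16U, 32U), 1, DataType::F32)); // [kernel_x, kernel_y, IFM, OFM]
    biases.allocator()->init(TensorInfo(TensorShape(32U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(32U, 32U, 32U), 1, DataType::F32)); // stride 1, pad 1 keeps 32x32

    NEConvolutionLayer conv;
    conv.configure(&src, &weights, &biases, &dst, PadStrideInfo(1, 1, 1, 1));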
diff --git a/arm_compute/runtime/NEON/functions/NEDepthConcatenate.h b/arm_compute/runtime/NEON/functions/NEDepthConcatenate.h
new file mode 100644
index 0000000..02ff122
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEDepthConcatenate.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEDEPTHCONCATENATE_H__
+#define __ARM_COMPUTE_NEDEPTHCONCATENATE_H__
+
+#include "arm_compute/runtime/IFunction.h"
+
+#include <memory>
+#include <vector>
+
+namespace arm_compute
+{
+class ITensor;
+class NEDepthConcatenateKernel;
+class NEFillBorderKernel;
+
+/** Basic function to concatenate tensors along the z axis. This function calls the following kernels:
+ *
+ * -# @ref NEFillBorderKernel (executed if an input's lowest two dimensions are smaller than the respective output dimensions)
+ * -# @ref NEDepthConcatenateKernel
+ *
+ */
+class NEDepthConcatenate : public IFunction
+{
+public:
+    /** Default constructor */
+    NEDepthConcatenate();
+    /** Initialise the kernel's inputs vector and output.
+     *
+     * @param[in,out] inputs_vector The vector containing all the tensors to concatenate. Data types supported: F32.
+     * @param[out]    output        Output tensor. Data types supported: F32.
+     */
+    void configure(std::vector<ITensor *> inputs_vector, ITensor *output);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    std::vector<ITensor *>                      _inputs_vector;
+    std::unique_ptr<NEDepthConcatenateKernel[]> _concat_kernels_vector;
+    std::unique_ptr<NEFillBorderKernel[]>       _border_handlers_vector;
+    unsigned int                                _num_inputs;
+};
+}
+#endif /* __ARM_COMPUTE_NEDEPTHCONCATENATE_H__ */
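A concatenation sketch: the output depth is the sum of the input depths, and borders are filled where an input's x/y extent is smaller than the output's:

    Tensor in0, in1, out;
    in0.allocator()->init(TensorInfo(TensorShape(56U, 56U, 64U), 1, DataType::F32));
    in1.allocator()->init(TensorInfo(TensorShape(56U, 56U, 32U), 1, DataType::F32));
    out.allocator()->init(TensorInfo(TensorShape(56U, 56U, 96U), 1, DataType::F32)); // 64 + 32

    NEDepthConcatenate concat;
    concat.configure({ &in0, &in1 }, &out);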
diff --git a/arm_compute/runtime/NEON/functions/NEDepthConvert.h b/arm_compute/runtime/NEON/functions/NEDepthConvert.h
index 21ccca3..7c59ce4 100644
--- a/arm_compute/runtime/NEON/functions/NEDepthConvert.h
+++ b/arm_compute/runtime/NEON/functions/NEDepthConvert.h
@@ -48,17 +48,18 @@
      * Input format must be different than output format.
      *
      * Valid conversions Input -> Output :
-     *    U8 -> U16, S16, U32, S32
-     *    U16 -> U8, U32, S32
-     *    S16 -> U8, U32, S32
-     *    U32 -> U8, U16, S16
-     *    S32 -> U8, U16, S16
+     *    QS8 -> F32
+     *    U8 -> U16, S16, S32
+     *    U16 -> U8, U32
+     *    S16 -> U8, S32
+     *    F32 -> QS8
      *
      *
-     * @param[in]  input  The input tensor to convert. Data type supported: U8, U16, S16, U32 or S32.
-     * @param[out] output The output tensor. Data type supported: U8, U16, S16, U32 or S32.
+     * @param[in]  input  The input tensor to convert. Data type supported: QS8/U8/U16/S16/F32.
+     * @param[out] output The output tensor. Data type supported: QS8/U8/U16/S16/U32/S32/F32.
      * @param[in]  policy Conversion policy.
      * @param[in]  shift  Value for down/up conversions. Must be 0 <= shift < 8.
+     *                    It is not used for fixed point conversions.
      */
     void configure(const ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift);
 };
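A short sketch of one of the conversion paths listed above (QS8 <-> F32 is the new fixed point route; src_u8 and dst_s16 are tensors initialised as in the earlier examples):

    NEDepthConvert convert;
    // Widen U8 to S16 without shifting; shift must satisfy 0 <= shift < 8
    convert.configure(&src_u8, &dst_s16, ConvertPolicy::SATURATE, 0);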
diff --git a/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h
new file mode 100644
index 0000000..a356cac
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYER_H__
+#define __ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYER_H__
+
+#include "arm_compute/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.h"
+#include "arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h"
+#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/Tensor.h"
+
+namespace arm_compute
+{
+/** Function to run the direct convolution.
+ *
+ *  This function calls the following NEON kernels:
+ *
+ * -# @ref NEFillBorderKernel for the input
+ * -# @ref NEDirectConvolutionLayerBiasAccumulateKernel
+ * -# @ref NEDirectConvolutionLayerKernel
+ */
+class NEDirectConvolutionLayer : public IFunction
+{
+public:
+    /** Constructor */
+    NEDirectConvolutionLayer();
+    /** Set the input, weights, biases and output tensors.
+      *
+      * @param[in, out] input     Input tensor. Data types supported: QS8/F32.
+      * @param[in]      weights   Set of kernels to convolve the input volume.
+      *                           The 3rd dimension must be the same as the input's volume 3rd dimension.
+      *                           Data type supported: Same as @p input.
+      * @param[in]      bias      Set of biases. Data type supported: Same as @p input.
+      * @param[out]     output    Output tensor.
+      *                           The 3rd dimension must be equal to the 4th dimension of the @p weights tensor. Data types supported: Same as @p input.
+      * @param[in]      conv_info Contains padding and stride information described in @ref PadStrideInfo.
+      */
+    void configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &conv_info);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    NEDirectConvolutionLayerBiasAccumulateKernel _accumulate_bias_kernel;
+    NEDirectConvolutionLayerKernel               _conv_kernel;
+    NEFillBorderKernel                           _input_border_handler;
+    Tensor                                       _accumulator;
+};
+}
+#endif /* __ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYER_H__ */
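Configuration mirrors NEConvolutionLayer, except the input is passed in/out because NEFillBorderKernel writes its borders; reusing the shapes from the convolution sketch above:

    NEDirectConvolutionLayer direct_conv;
    direct_conv.configure(&src, &weights, &biases, &dst, PadStrideInfo(1, 1, 1, 1));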
diff --git a/arm_compute/runtime/NEON/functions/NEFillBorder.h b/arm_compute/runtime/NEON/functions/NEFillBorder.h
index c69c285..b6b7e77 100644
--- a/arm_compute/runtime/NEON/functions/NEFillBorder.h
+++ b/arm_compute/runtime/NEON/functions/NEFillBorder.h
@@ -41,7 +41,7 @@
      *
      * @note This function fills the borders within the XY-planes.
      *
-     * @param[in, out] input                 Source tensor. Data type supported: U8, S16, S32, F32
+     * @param[in, out] input                 Source tensor. Data type supported: U8/QS8/S16/S32/F32
      * @param[in]      border_width          Width of the tensor border in pixels.
      * @param[in]      border_mode           Strategy to use for borders.
      * @param[in]      constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
diff --git a/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h b/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h
index 69e27b8..33ec4ef 100644
--- a/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h
@@ -36,10 +36,41 @@
 
 namespace arm_compute
 {
+/** Basic function to reshape the weights of Fully Connected layer with NEON. This function calls the following kernels:
+ *
+ *  -# @ref NETransposeKernel        (if @p transpose_weights is set to true)
+ *  -# @ref NEGEMMTranspose1xWKernel (if @p is_batched_fc_layer is set to true)
+ *
+ * @note  The fully connected layer accepts "weights" tensors only with 2 dimensions.
+ */
+class NEFullyConnectedLayerReshapeWeights : public IFunction
+{
+public:
+    /** Constructor */
+    NEFullyConnectedLayerReshapeWeights();
+    /** Set the input and output tensors.
+     *
+     * @param[in]  input               Weights tensor. The weights must be 2 dimensional. Data types supported: QS8/F32.
+     * @param[out] output              Destination tensor. Data type supported: Same as @p input.
+     * @param[in]  transpose_weights   True if the weights must be transposed.
+     * @param[in]  is_batched_fc_layer True if it is a batched fully connected layer.
+     */
+    void configure(const ITensor *input, ITensor *output, bool transpose_weights, bool is_batched_fc_layer);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    NETransposeKernel        _transpose_kernel;
+    NEGEMMTranspose1xWKernel _transpose1xW_kernel;
+    Tensor                   _transpose_output;
+    bool                     _transpose_weights;
+    bool                     _is_batched_fc_layer;
+};
+
 /** Basic function to compute a Fully Connected layer on NEON. This function calls the following NEON kernels:
- *  -# @ref NEIm2ColKernel (called when the input comes from a convolutional layer)
- *  -# @ref NETransposeKernel (if @p transpose_weights flag is set to true) (called once)
- *  -# @ref NEGEMMTranspose1xWKernel (called once if we have a multi-batch input)
+ *  -# @ref NEIm2ColKernel                      (called when the input comes from a convolutional layer)
+ *  -# @ref NEFullyConnectedLayerReshapeWeights (if @p are_weights_reshaped flag is set to false) (called once)
  *  -# @ref NEGEMMInterleave4x4Kernel (called if we have a multi-batch input)
  *  -# @ref NEGEMMMatrixMultiplyKernel
  *  -# @ref NEGEMMMatrixAccumulateBiasesKernel (if @p biases is not equal to nullptr)
@@ -53,13 +84,14 @@
     NEFullyConnectedLayer();
     /** Set the input and output tensors.
      *
-     * @param[in]  input             Source tensor. Data type supported: F32.
-     * @param[in]  weights           Weights tensor. The weights must be 2 dimensional. Data type supported: Same as @p input.
-     * @param[in]  biases            Bias tensor. Can be nullptr. Data type supported:Same as @p input.
-     * @param[out] output            Destination tensor. Data type supported: Same as @p input.
-     * @param[in]  transpose_weights (Optional) Transpose weights if true. Defaults to true.
+     * @param[in]  input                Source tensor. Data type supported: QS8/F32.
+     * @param[in]  weights              Weights tensor. The weights must be 2 dimensional. Data type supported: Same as @p input.
+     * @param[in]  biases               Bias tensor. Can be nullptr. Data type supported: Same as @p input.
+     * @param[out] output               Destination tensor. Data type supported: Same as @p input.
+     * @param[in]  transpose_weights    (Optional) Transpose the weights tensor if true. Defaults to true.
+     * @param[in]  are_weights_reshaped (Optional) True if the weights have already been reshaped with NEFullyConnectedLayerReshapeWeights; if false (the default) the weights are reshaped on first run.
      */
-    void configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, bool transpose_weights = true);
+    void configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, bool transpose_weights = true, bool are_weights_reshaped = false);
 
     //Inherited methods override
     void run() override;
@@ -70,21 +102,18 @@
     void configure_conv_fc_wb(const ITensor *input, const ITensor *weights, ITensor *output);
     void configure_conv_fc_nb(const ITensor *input, const ITensor *weights, ITensor *output);
 
-    NEIm2ColKernel                     _im2col_kernel;
-    NETransposeKernel                  _transpose_kernel;
-    NEGEMMTranspose1xWKernel           _transpose1xW_kernel;
-    NEGEMMInterleave4x4Kernel          _interleave4x4_kernel;
-    NEGEMMMatrixMultiplyKernel         _mm_kernel;
-    NEGEMMMatrixAccumulateBiasesKernel _accumulate_biases_kernel;
-    Tensor                             _im2col_output;
-    Tensor                             _interleave4x4_output;
-    Tensor                             _transpose_output;
-    Tensor                             _transpose1xW_output;
-    bool                               _is_first_run;
-    bool                               _transpose_weights;
-    bool                               _fc_after_conv;
-    bool                               _batched_fc_layer;
-    bool                               _accumulate_biases;
+    NEIm2ColKernel                      _im2col_kernel;
+    NEFullyConnectedLayerReshapeWeights _reshape_weights_kernel;
+    NEGEMMInterleave4x4Kernel           _interleave4x4_kernel;
+    NEGEMMMatrixMultiplyKernel          _mm_kernel;
+    NEGEMMMatrixAccumulateBiasesKernel  _accumulate_biases_kernel;
+    Tensor                              _im2col_output;
+    Tensor                              _interleave4x4_output;
+    Tensor                              _reshape_weights_output;
+    bool                                _are_weights_reshaped;
+    bool                                _is_fc_after_conv;
+    bool                                _is_batched_fc_layer;
+    bool                                _accumulate_biases;
 };
 }
 #endif /* __ARM_COMPUTE_NEFULLYCONNECTEDLAYER_H__ */
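The split lets the weight reshape be hoisted out and shared; a hedged sketch (the exact interplay of the flags follows the documentation above, and weights_reshaped is an illustrative pre-allocated tensor):

    // One-off reshape of the 2D weights (transpose requested, non-batched layer)
    NEFullyConnectedLayerReshapeWeights reshape;
    reshape.configure(&weights, &weights_reshaped, true, false);

    NEFullyConnectedLayer fc;
    fc.configure(&src, &weights_reshaped, &biases, &dst, true, /* are_weights_reshaped */ true);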
diff --git a/arm_compute/runtime/NEON/functions/NEGEMM.h b/arm_compute/runtime/NEON/functions/NEGEMM.h
index b9346e7..a40aa91 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMM.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMM.h
@@ -50,9 +50,9 @@
     /** Initialise the kernel's inputs, output
      *
      * @note GEMM: General Matrix Multiply - [alpha * A * B + beta * C].
-     * @note GEMM: The tensors a, b, c, d must have the same data type. All are either F32 or F16. You should not mix data types when calling this function.
+     * @note GEMM: The tensors a, b, c, d must have the same data type. You should not mix data types when calling this function.
      *
-     * @param[in]  a     First input tensor  (Matrix A or Vector A). Data type supported: F32, F16.
+     * @param[in]  a     First input tensor  (Matrix A or Vector A). Data type supported: QS8/F16/F32
      * @param[in]  b     Second input tensor (Matrix B). Data type supported: same as @p a
      * @param[in]  c     Third input tensor  (Matrix C). It can be a nullptr if just the multiplication between @p a and @p b is needed. Data type supported: same as @p a
      * @param[out] d     Output tensor. Data type supported: same as @p a
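A sketch of the multiply-accumulate form D = alpha * A * B + beta * C (alpha and beta are the trailing scalar parameters of the full configure() signature):

    NEGEMM gemm;
    // Plain product D = A * B: pass nullptr for C and a zero beta
    gemm.configure(&a, &b, nullptr, &d, 1.0f, 0.0f);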
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMInterleave4x4.h b/arm_compute/runtime/NEON/functions/NEGEMMInterleave4x4.h
index 71fefbf..b911fd0 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMInterleave4x4.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMInterleave4x4.h
@@ -40,7 +40,7 @@
 public:
     /** Initialise the kernel's inputs, output
      *
-     * @param[in]  input  First input tensor. Data types supported: U8/S8/U16/S16/F16/U32/S32/F32
+     * @param[in]  input  First input tensor. Data types supported: U8/S8/QS8/U16/S16/F16/U32/S32/F32
      * @param[out] output Output tensor. Data type supported: same as @p input
      */
     void configure(const ITensor *input, ITensor *output);
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMTranspose1xW.h b/arm_compute/runtime/NEON/functions/NEGEMMTranspose1xW.h
index 69096fb..447b8c9 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMTranspose1xW.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMTranspose1xW.h
@@ -38,7 +38,7 @@
 public:
     /** Initialise the kernel's inputs, output
      *
-     * @param[in]  input  First input tensor. Data type supported: F32, F16, U8.
+     * @param[in]  input  First input tensor. Data type supported: U8/S8/QS8/U16/S16/F16/U32/S32/F32
      * @param[out] output Output tensor. Data type supported: same as @p input
      */
     void configure(const ITensor *input, ITensor *output);
diff --git a/arm_compute/runtime/NEON/functions/NEGaussian5x5.h b/arm_compute/runtime/NEON/functions/NEGaussian5x5.h
index 7487f66..699e42e 100644
--- a/arm_compute/runtime/NEON/functions/NEGaussian5x5.h
+++ b/arm_compute/runtime/NEON/functions/NEGaussian5x5.h
@@ -64,7 +64,7 @@
 protected:
     NEGaussian5x5HorKernel  _kernel_hor;     /**< kernel for horizontal pass */
     NEGaussian5x5VertKernel _kernel_vert;    /**< kernel for vertical pass */
-    Tensor                  _tmp;            /** temporary buffer for output of horizontal pass */
+    Tensor                  _tmp;            /**< temporary buffer for output of horizontal pass */
     NEFillBorderKernel      _border_handler; /**< kernel to handle tensor borders */
 };
 }
diff --git a/arm_compute/runtime/NEON/functions/NEHOGDetector.h b/arm_compute/runtime/NEON/functions/NEHOGDetector.h
index 46ab72c..98b8a89 100644
--- a/arm_compute/runtime/NEON/functions/NEHOGDetector.h
+++ b/arm_compute/runtime/NEON/functions/NEHOGDetector.h
@@ -40,6 +40,8 @@
 public:
     /** Initialise the kernel's input, output, HOG data object, detection window stride, threshold and index class
      *
+     * @attention The function does not reset the number of values in @ref IDetectionWindowArray, so it is the caller's responsibility to clear it.
+     *
      * @param[in]  input                   Input tensor. It is the output of @ref NEHOGDescriptor. Data type supported: F32
      * @param[in]  hog                     HOG data-object that describes the HOG descriptor
      * @param[out] detection_windows       Array of @ref DetectionWindow used to store the detected objects
diff --git a/arm_compute/runtime/NEON/functions/NEHOGMultiDetection.h b/arm_compute/runtime/NEON/functions/NEHOGMultiDetection.h
index 9440ee0..2d07e64 100644
--- a/arm_compute/runtime/NEON/functions/NEHOGMultiDetection.h
+++ b/arm_compute/runtime/NEON/functions/NEHOGMultiDetection.h
@@ -24,10 +24,10 @@
 #ifndef __ARM_COMPUTE_NEHOGMULTIDETECTION_H__
 #define __ARM_COMPUTE_NEHOGMULTIDETECTION_H__
 
+#include "arm_compute/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.h"
 #include "arm_compute/core/IArray.h"
 #include "arm_compute/core/IMultiHOG.h"
 #include "arm_compute/core/NEON/kernels/NEHOGDescriptorKernel.h"
-#include "arm_compute/core/NEON/kernels/NEHOGNonMaximaSuppressionKernel.h"
 #include "arm_compute/runtime/IFunction.h"
 #include "arm_compute/runtime/NEON/functions/NEHOGDetector.h"
 #include "arm_compute/runtime/NEON/functions/NEHOGGradient.h"
@@ -41,7 +41,7 @@
  * -# @ref NEHOGOrientationBinningKernel
  * -# @ref NEHOGBlockNormalizationKernel
  * -# @ref NEHOGDetector
- * -# @ref NEHOGNonMaximaSuppressionKernel (executed if non_maxima_suppression == true)
+ * -# @ref CPPDetectionWindowNonMaximaSuppressionKernel (executed if non_maxima_suppression == true)
  *
  * @note This implementation works if all the HOG data-objects within the IMultiHOG container have the same:
  *       -# Phase type
@@ -85,20 +85,20 @@
     void run() override;
 
 private:
-    NEHOGGradient                                    _gradient_kernel;
-    std::unique_ptr<NEHOGOrientationBinningKernel[]> _orient_bin_kernel;
-    std::unique_ptr<NEHOGBlockNormalizationKernel[]> _block_norm_kernel;
-    std::unique_ptr<NEHOGDetector[]>                 _hog_detect_kernel;
-    std::unique_ptr<NEHOGNonMaximaSuppressionKernel> _non_maxima_kernel;
-    std::unique_ptr<Tensor[]>                        _hog_space;
-    std::unique_ptr<Tensor[]>                        _hog_norm_space;
-    IDetectionWindowArray                           *_detection_windows;
-    Tensor                                           _mag;
-    Tensor                                           _phase;
-    bool                                             _non_maxima_suppression;
-    size_t                                           _num_orient_bin_kernel;
-    size_t                                           _num_block_norm_kernel;
-    size_t                                           _num_hog_detect_kernel;
+    NEHOGGradient                                                 _gradient_kernel;
+    std::unique_ptr<NEHOGOrientationBinningKernel[]>              _orient_bin_kernel;
+    std::unique_ptr<NEHOGBlockNormalizationKernel[]>              _block_norm_kernel;
+    std::unique_ptr<NEHOGDetector[]>                              _hog_detect_kernel;
+    std::unique_ptr<CPPDetectionWindowNonMaximaSuppressionKernel> _non_maxima_kernel;
+    std::unique_ptr<Tensor[]>                                     _hog_space;
+    std::unique_ptr<Tensor[]>                                     _hog_norm_space;
+    IDetectionWindowArray                                        *_detection_windows;
+    Tensor                                                        _mag;
+    Tensor                                                        _phase;
+    bool                                                          _non_maxima_suppression;
+    size_t                                                        _num_orient_bin_kernel;
+    size_t                                                        _num_block_norm_kernel;
+    size_t                                                        _num_hog_detect_kernel;
 };
 }
 
diff --git a/arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h b/arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h
new file mode 100644
index 0000000..1b2b2ee
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NELOCALLYCONNECTEDLAYER_H__
+#define __ARM_COMPUTE_NELOCALLYCONNECTEDLAYER_H__
+
+#include "arm_compute/runtime/IFunction.h"
+
+#include "arm_compute/core/NEON/kernels/NECol2ImKernel.h"
+#include "arm_compute/core/NEON/kernels/NEIm2ColKernel.h"
+#include "arm_compute/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.h"
+#include "arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/Tensor.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to compute the locally connected layer. This function calls the following NEON kernels:
+ *
+ * -# @ref NEWeightsReshapeKernel (executed only once for each configuration)
+ * -# @ref NEIm2ColKernel
+ * -# @ref NELocallyConnectedMatrixMultiplyKernel
+ * -# @ref NECol2ImKernel
+ */
+class NELocallyConnectedLayer : public IFunction
+{
+public:
+    /** Default constructor */
+    NELocallyConnectedLayer();
+    /** Set the input and output tensors.
+     *
+     * @param[in]  input     Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
+     *                       while every optional dimension from 4 and above represents a batch of inputs.
+     *                       Data types supported: F32.
+     * @param[in]  weights   Weights tensor. Weights are 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches]. Data type supported: Same as @p input.
+     * @param[in]  biases    Biases tensor. Shared biases supported. Biases are 2D tensor with dimensions [OFM, num_patches]. Data type supported: Same as @p input.
+     * @param[out] output    Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent a batch of outputs.
+     *                       Data types supported: Same as @p input.
+     * @param[in]  conv_info Contains padding and stride information described in @ref PadStrideInfo.
+     */
+    void configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    NEIm2ColKernel                         _input_im2col_kernel;
+    NEWeightsReshapeKernel                 _weights_reshape_kernel;
+    NELocallyConnectedMatrixMultiplyKernel _mm_kernel;
+    NECol2ImKernel                         _output_col2im_kernel;
+    Tensor                                 _input_im2col_reshaped;
+    Tensor                                 _weights_reshaped;
+    Tensor                                 _gemm_output;
+    bool                                   _is_first_run;
+};
+}
+#endif /* __ARM_COMPUTE_NELOCALLYCONNECTEDLAYER_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEMinMaxLocation.h b/arm_compute/runtime/NEON/functions/NEMinMaxLocation.h
index e60349a..82e75ee 100644
--- a/arm_compute/runtime/NEON/functions/NEMinMaxLocation.h
+++ b/arm_compute/runtime/NEON/functions/NEMinMaxLocation.h
@@ -48,7 +48,7 @@
     NEMinMaxLocation();
     /** Initialise the kernel's inputs and outputs.
      *
-     * @param[in]  input     Input image. Data types supported: U8 or S16.
+     * @param[in]  input     Input image. Data types supported: U8/S16.
      * @param[out] min       Minimum value of image.
      * @param[out] max       Maximum value of image.
      * @param[out] min_loc   (Optional) Array of minimum value locations.
diff --git a/arm_compute/runtime/NEON/functions/NENonMaximaSuppression3x3.h b/arm_compute/runtime/NEON/functions/NENonMaximaSuppression3x3.h
index 06e4f08..c87d722 100644
--- a/arm_compute/runtime/NEON/functions/NENonMaximaSuppression3x3.h
+++ b/arm_compute/runtime/NEON/functions/NENonMaximaSuppression3x3.h
@@ -45,7 +45,7 @@
      * @note The implementation supports just 2 border modes: UNDEFINED and CONSTANT
      *       The constant values used with CONSTANT border mode is 0
      *
-     * @param[in, out] input       Source tensor. Data type supported: U8, F32. (Written to only for @p border_mode != UNDEFINED)
+     * @param[in, out] input       Source tensor. Data type supported: U8/F32. (Written to only for @p border_mode != UNDEFINED)
      * @param[out]     output      Destination for the Non-Maxima suppressions 3x3. Data type supported: same as @p input
      * @param[in]      border_mode Border mode to use for non-maxima suppression. The implementation supports just 2 border modes: UNDEFINED and CONSTANT
      *
diff --git a/arm_compute/runtime/NEON/functions/NENormalizationLayer.h b/arm_compute/runtime/NEON/functions/NENormalizationLayer.h
index b7be34d..3202867 100644
--- a/arm_compute/runtime/NEON/functions/NENormalizationLayer.h
+++ b/arm_compute/runtime/NEON/functions/NENormalizationLayer.h
@@ -52,7 +52,7 @@
     /** Set the input and output tensors.
      *
      * @param[in]  input     Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM],
-     *                       and an optional 4th dimension for batch of inputs. Data type supported: F32. Number of channels must be 1.
+     *                       and an optional 4th dimension for batch of inputs. Data type supported: QS8/F32
      * @param[out] output    Destination with the same dimensions, data type and number of channels as @p input
      * @param[in]  norm_info Normalization layer information like the normalization type, normalization size and other parameters.
      */
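A sketch of the configure() call with a cross-map normalization over 5 neighbouring feature maps:

    NENormalizationLayer norm;
    norm.configure(&src, &dst, NormalizationLayerInfo(NormType::CROSS_MAP, 5));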
diff --git a/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h b/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h
index 835bd13..de7a797 100644
--- a/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h
+++ b/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h
@@ -37,9 +37,9 @@
 public:
     /** Initialise the kernel's inputs, output and conversion policy.
      *
-     * @param[in]  input1          First tensor input. Data types supported: U8 or S16.
-     * @param[in]  input2          Second tensor input. Data types supported: U8 or S16.
-     * @param[out] output          Output tensor. Data types supported: U8 or S16.
+     * @param[in]  input1          First tensor input. Data types supported: U8/QS8/S16/F32.
+     * @param[in]  input2          Second tensor input. Data types supported: U8/QS8/S16/F32.
+     * @param[out] output          Output tensor. Data types supported: U8/QS8/S16/F32.
      * @param[in]  scale           Scale to apply after multiplication. Must be positive.
      * @param[in]  overflow_policy Overflow policy.
      * @param[in]  rounding_policy Rounding policy.
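A sketch showing the scale factor and the two policies:

    NEPixelWiseMultiplication mul;
    // out = in1 * in2 (scale 1), saturating on overflow, rounding to nearest even
    mul.configure(&in1, &in2, &out, 1.0f, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);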
diff --git a/arm_compute/runtime/NEON/functions/NEPoolingLayer.h b/arm_compute/runtime/NEON/functions/NEPoolingLayer.h
index 5d67830..5a9cffa 100644
--- a/arm_compute/runtime/NEON/functions/NEPoolingLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEPoolingLayer.h
@@ -42,7 +42,7 @@
 public:
     /** Set the input and output tensors.
      *
-     * @param[in, out] input     Source tensor. (Written to only when padding != 0) Data types supported: F32.
+     * @param[in, out] input     Source tensor. (Written to only when padding != 0) Data types supported: QS8/F32.
      * @param[out]     output    Destination tensor. Data types supported: Same as @p input.
      * @param[in]      pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
      */
diff --git a/arm_compute/runtime/NEON/functions/NESoftmaxLayer.h b/arm_compute/runtime/NEON/functions/NESoftmaxLayer.h
index c67b667..dc84dec 100644
--- a/arm_compute/runtime/NEON/functions/NESoftmaxLayer.h
+++ b/arm_compute/runtime/NEON/functions/NESoftmaxLayer.h
@@ -50,7 +50,7 @@
     NESoftmaxLayer();
     /** Set the input and output tensors.
      *
-     * @param[in]  input  Source tensor. Data types supported: F32.
+     * @param[in]  input  Source tensor. Data types supported: QS8/F32.
      * @param[out] output Destination tensor. Data types supported: same as @p input.
      */
     void configure(ITensor *input, ITensor *output);
@@ -63,7 +63,6 @@
     NELogits1DShiftExpSumKernel _shift_exp_sum_kernel;
     NELogits1DNormKernel        _norm_kernel;
     NEFillBorderKernel          _fill_border_kernel;
-    NEFillBorderKernel          _fill_border_kernel_sum;
     Tensor                      _max;
     Tensor                      _sum;
     Tensor                      _tmp;
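Usage stays a single configure()/run() pair; a sketch for F32 logits:

    Tensor logits, probs;
    logits.allocator()->init(TensorInfo(TensorShape(1000U), 1, DataType::F32));
    probs.allocator()->init(TensorInfo(TensorShape(1000U), 1, DataType::F32));

    NESoftmaxLayer softmax;
    softmax.configure(&logits, &probs);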
diff --git a/arm_compute/runtime/NEON/functions/NETableLookup.h b/arm_compute/runtime/NEON/functions/NETableLookup.h
index d2f9d30..b59ffb8 100644
--- a/arm_compute/runtime/NEON/functions/NETableLookup.h
+++ b/arm_compute/runtime/NEON/functions/NETableLookup.h
@@ -37,9 +37,9 @@
 public:
     /** Initialise the kernel's inputs and output
      *
-     * @param[in]  input  First tensor input. Data types supported: U8 and S16
+     * @param[in]  input  First tensor input. Data types supported: U8/S16
      * @param[in]  lut    Input lookup table.
-     * @param[out] output Output tensor. Data types supported: U8 and S16.
+     * @param[out] output Output tensor. Data types supported: same as @p input
      */
     void configure(const ITensor *input, const ILut *lut, ITensor *output);
 };
diff --git a/arm_compute/runtime/NEON/functions/NETranspose.h b/arm_compute/runtime/NEON/functions/NETranspose.h
index 1b88715..4b606e7 100644
--- a/arm_compute/runtime/NEON/functions/NETranspose.h
+++ b/arm_compute/runtime/NEON/functions/NETranspose.h
@@ -41,7 +41,7 @@
 public:
     /** Initialise the kernel's inputs and output
      *
-     * @param[in]  input  Input tensor. Data types supported: U8/S8/U16/S16/F16/U32/S32/F32
+     * @param[in]  input  Input tensor. Data types supported: U8/S8/QS8/U16/S16/F16/U32/S32/F32
      * @param[out] output Output tensor. Data type supported: Same as @p input
      */
     void configure(const ITensor *input, ITensor *output);
diff --git a/arm_compute/runtime/OMP/OMPScheduler.h b/arm_compute/runtime/OMP/OMPScheduler.h
new file mode 100644
index 0000000..21df6a6
--- /dev/null
+++ b/arm_compute/runtime/OMP/OMPScheduler.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_OMPSCHEDULER_H__
+#define __ARM_COMPUTE_OMPSCHEDULER_H__
+
+#include "arm_compute/runtime/IScheduler.h"
+
+namespace arm_compute
+{
+/** Scheduler that uses OpenMP threads to automatically split a kernel's execution among several threads. */
+class OMPScheduler : public IScheduler
+{
+public:
+    /** Sets the number of threads the scheduler will use to run the kernels.
+     *
+     * @param[in] num_threads Number of threads to use. If set to 0, the number returned by omp_get_max_threads() will be used instead.
+     */
+    void set_num_threads(unsigned int num_threads) override;
+    /** Returns the number of threads that the OMPScheduler has in its pool.
+     *
+     * @return Number of threads available in OMPScheduler.
+     */
+    unsigned int num_threads() const override;
+    /** Access the scheduler singleton
+     *
+     * @return The scheduler
+     */
+    static OMPScheduler &get();
+    /** Multithread the execution of the passed kernel if possible.
+     *
+     * The kernel will run on a single thread if any of these conditions is true:
+     * - ICPPKernel::is_parallelisable() returns false
+     * - The scheduler has been initialized with only one thread.
+     *
+     * @param[in] kernel          Kernel to execute.
+     * @param[in] split_dimension Dimension along which to split the kernel's execution window.
+     */
+    void schedule(ICPPKernel *kernel, unsigned int split_dimension) override;
+
+private:
+    /** Constructor. */
+    OMPScheduler();
+
+    unsigned int _num_threads;
+};
+}
+#endif /* __ARM_COMPUTE_OMPSCHEDULER_H__ */
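
The header only declares the interface. As an illustration of the splitting idea (not the library's actual implementation), one plausible way to divide an iteration range across OpenMP threads is sketched below; total and run_chunk are hypothetical stand-ins for the kernel's window arithmetic:

    #include <omp.h>
    #include <algorithm>

    void schedule_sketch(unsigned int num_threads, unsigned int total,
                         void (*run_chunk)(unsigned int first, unsigned int last))
    {
        #pragma omp parallel num_threads(num_threads)
        {
            const unsigned int tid   = omp_get_thread_num();
            const unsigned int nthr  = omp_get_num_threads();
            const unsigned int step  = (total + nthr - 1) / nthr; // ceiling division
            const unsigned int first = std::min(tid * step, total);
            const unsigned int last  = std::min(first + step, total);
            if(first < last)
            {
                run_chunk(first, last); // each thread processes its own sub-range
            }
        }
    }
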
diff --git a/arm_compute/runtime/Scheduler.h b/arm_compute/runtime/Scheduler.h
new file mode 100644
index 0000000..21f944b
--- /dev/null
+++ b/arm_compute/runtime/Scheduler.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_SCHEDULER_H__
+#define __ARM_COMPUTE_SCHEDULER_H__
+
+#include "arm_compute/runtime/IScheduler.h"
+#include <memory>
+
+namespace arm_compute
+{
+/** Configurable scheduler which supports multiple multithreading APIs and allows choosing between different schedulers at runtime. */
+class Scheduler
+{
+public:
+    enum class Type
+    {
+        ST,    // Single thread.
+        CPP,   // C++11 threads.
+        OMP,   // OpenMP.
+        CUSTOM // Provided by the user.
+    };
+    /** Sets the user defined scheduler and makes it the active scheduler.
+     *
+     * @param[in] scheduler A shared pointer to a custom scheduler implemented by the user.
+     */
+    static void set(std::shared_ptr<IScheduler> &scheduler);
+    /** Access the scheduler singleton.
+     *
+     * @return A reference to the scheduler object.
+     */
+    static IScheduler &get();
+    /** Set the active scheduler.
+     *
+     * Only one scheduler can be enabled at any time.
+     *
+     * @param[in] t The type of the scheduler to be enabled.
+     */
+    static void set(Type t);
+    /** Returns the type of the active scheduler.
+     *
+     * @return The current scheduler's type.
+     */
+    static Type get_type();
+    /** Check whether the given scheduler type is supported.
+     *
+     * @param[in] t Scheduler type to query.
+     *
+     * @return True if the given scheduler type is supported, false otherwise.
+     */
+    static bool is_available(Type t);
+
+private:
+    static Type                        _scheduler_type;
+    static std::shared_ptr<IScheduler> _custom_scheduler;
+    Scheduler();
+};
+}
+#endif /* __ARM_COMPUTE_SCHEDULER_H__ */
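
A usage sketch built only from the declarations above, assuming a build in which the OpenMP backend may or may not have been compiled in:

    #include "arm_compute/runtime/Scheduler.h"

    using namespace arm_compute;

    void pick_scheduler()
    {
        // Prefer OpenMP when available, otherwise fall back to C++11 threads.
        if(Scheduler::is_available(Scheduler::Type::OMP))
        {
            Scheduler::set(Scheduler::Type::OMP);
        }
        else
        {
            Scheduler::set(Scheduler::Type::CPP);
        }
        Scheduler::get().set_num_threads(4); // set_num_threads() comes from IScheduler
    }
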
diff --git a/arm_compute/runtime/SingleThreadScheduler.h b/arm_compute/runtime/SingleThreadScheduler.h
new file mode 100644
index 0000000..a6e1def
--- /dev/null
+++ b/arm_compute/runtime/SingleThreadScheduler.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_SINGLETHREADSCHEDULER_H__
+#define __ARM_COMPUTE_SINGLETHREADSCHEDULER_H__
+
+#include "arm_compute/runtime/IScheduler.h"
+
+namespace arm_compute
+{
+/** Scheduler that runs all kernels sequentially in the caller's thread. */
+class SingleThreadScheduler : public IScheduler
+{
+public:
+    /** Sets the number of threads the scheduler will use to run the kernels.
+     *
+     * @param[in] num_threads This is ignored for this scheduler as the number of threads is always one.
+     */
+    void set_num_threads(unsigned int num_threads) override;
+    /** Returns the number of threads that the SingleThreadScheduler has, which is always 1.
+     *
+     * @return Number of threads available in SingleThreadScheduler.
+     */
+    unsigned int num_threads() const override;
+    /** Access the scheduler singleton
+     *
+     * @return The scheduler
+     */
+    static SingleThreadScheduler &get();
+    /** Runs the kernel in the same thread as the caller synchronously.
+     *
+     * @param[in] kernel          Kernel to execute.
+     * @param[in] split_dimension Dimension along which to split the kernel's execution window.
+     */
+    void schedule(ICPPKernel *kernel, unsigned int split_dimension) override;
+
+private:
+    /** Constructor. */
+    SingleThreadScheduler() = default;
+};
+}
+#endif /* __ARM_COMPUTE_SINGLETHREADSCHEDULER_H__ */
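
Per the schedule() documentation above, the single-threaded case reduces to running the kernel synchronously over its full window. A sketch under the assumption that this release's ICPPKernel exposes run(const Window &) and inherits window() from IKernel:

    #include "arm_compute/core/CPP/ICPPKernel.h"

    using namespace arm_compute;

    void run_in_caller_thread(ICPPKernel *kernel)
    {
        // No splitting: execute the whole window in the caller's thread.
        kernel->run(kernel->window());
    }
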
diff --git a/arm_compute/runtime/SubTensor.h b/arm_compute/runtime/SubTensor.h
new file mode 100644
index 0000000..bdb229d
--- /dev/null
+++ b/arm_compute/runtime/SubTensor.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_SUBTENSOR_H__
+#define __ARM_COMPUTE_SUBTENSOR_H__
+
+#include "arm_compute/core/SubTensorInfo.h"
+#include "arm_compute/runtime/Tensor.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensorInfo;
+
+/** Basic implementation of the sub-tensor interface */
+class SubTensor : public ITensor
+{
+public:
+    /** Constructor
+     *
+     * @param[in] parent       Parent tensor
+     * @param[in] tensor_shape Shape of the sub-tensor
+     * @param[in] coords       Coordinates of the first sub-tensor element inside the parent tensor.
+     */
+    SubTensor(ITensor *parent, const TensorShape &tensor_shape, const Coordinates &coords);
+    /** Default destructor. The sub-tensor does not own the parent tensor's memory. */
+    ~SubTensor() = default;
+    /** Prevent instances of this class from being copy constructed */
+    SubTensor(const SubTensor &) = delete;
+    /** Prevent instances of this class from being copied */
+    SubTensor &operator=(const SubTensor &) = delete;
+    /** Allow instances of this class to be move constructed */
+    SubTensor(SubTensor &&) = default;
+    /** Allow instances of this class to be moved */
+    SubTensor &operator=(SubTensor &&) = default;
+    /** Return the parent tensor of the sub-tensor
+     *
+     * @return Parent tensor
+     */
+    ITensor *parent();
+
+    // Inherited methods overridden:
+    ITensorInfo *info() const override;
+    ITensorInfo *info() override;
+    uint8_t     *buffer() const override;
+
+private:
+    ITensor              *_parent;
+    mutable SubTensorInfo _info;
+};
+}
+#endif /*__ARM_COMPUTE_SUBTENSOR_H__ */
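
A usage sketch for the constructor above: viewing a 32x32 region of a 64x64 F32 parent tensor, starting at element (16, 16). The shapes and coordinates are illustrative:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/SubTensor.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void subtensor_example()
    {
        Tensor parent;
        parent.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::F32));
        parent.allocator()->allocate();

        // The sub-tensor aliases the parent's buffer; no new memory is allocated.
        SubTensor roi(&parent, TensorShape(32U, 32U), Coordinates(16, 16));
    }
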
diff --git a/arm_compute/runtime/Tensor.h b/arm_compute/runtime/Tensor.h
index e491635..1fe73a2 100644
--- a/arm_compute/runtime/Tensor.h
+++ b/arm_compute/runtime/Tensor.h
@@ -31,7 +31,7 @@
 
 namespace arm_compute
 {
-class TensorInfo;
+class ITensorInfo;
 
 /** Basic implementation of the tensor interface */
 class Tensor : public ITensor
@@ -52,9 +52,9 @@
     TensorAllocator *allocator();
 
     // Inherited methods overridden:
-    TensorInfo *info() const override;
-    TensorInfo *info() override;
-    uint8_t    *buffer() const override;
+    ITensorInfo *info() const override;
+    ITensorInfo *info() override;
+    uint8_t     *buffer() const override;
 
 private:
     mutable TensorAllocator _allocator; /**< Instance of the basic CPU allocator.*/
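
The caller-visible effect of this change is that client code should hold the interface type, which Tensor and the new SubTensor can both return. A hedged sketch (the helper function is illustrative):

    #include "arm_compute/core/ITensorInfo.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    size_t element_count(const Tensor &t)
    {
        const ITensorInfo *info = t.info(); // interface pointer, not the concrete TensorInfo
        return info->tensor_shape().total_size();
    }
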
diff --git a/arm_compute/runtime/Utils.h b/arm_compute/runtime/Utils.h
new file mode 100644
index 0000000..2f037a0
--- /dev/null
+++ b/arm_compute/runtime/Utils.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_RUNTIME_UTILS_H__
+#define __ARM_COMPUTE_RUNTIME_UTILS_H__
+
+#include "arm_compute/runtime/Scheduler.h"
+
+#include <string>
+
+namespace arm_compute
+{
+/** Convert a Scheduler::Type into a string.
+ *
+ * @param[in] t @ref Scheduler::Type to be translated to string.
+ *
+ * @return The string describing the scheduler type.
+ */
+const std::string &string_from_scheduler_type(Scheduler::Type t);
+}
+#endif /* __ARM_COMPUTE_RUNTIME_UTILS_H__ */
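
A usage sketch for the helper declared above, printing the active scheduler's name (the surrounding function is illustrative):

    #include "arm_compute/runtime/Scheduler.h"
    #include "arm_compute/runtime/Utils.h"

    #include <iostream>

    using namespace arm_compute;

    void print_active_scheduler()
    {
        std::cout << "Active scheduler: " << string_from_scheduler_type(Scheduler::get_type()) << std::endl;
    }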