arm_compute v17.06
diff --git a/arm_compute/core/AccessWindowAutoPadding.h b/arm_compute/core/AccessWindowAutoPadding.h
index ef058bc..0a3344b 100644
--- a/arm_compute/core/AccessWindowAutoPadding.h
+++ b/arm_compute/core/AccessWindowAutoPadding.h
@@ -32,7 +32,7 @@
 namespace arm_compute
 {
 class Window;
-class TensorInfo;
+class ITensorInfo;
 
 /** Dummy access window.
  *
@@ -51,7 +51,7 @@
      *
      * @param[in,out] info Tensor info of the accessed kernel.
      */
-    AccessWindowAutoPadding(TensorInfo *info);
+    AccessWindowAutoPadding(ITensorInfo *info);
     AccessWindowAutoPadding(const AccessWindowAutoPadding &) = delete;
     AccessWindowAutoPadding &operator=(const AccessWindowAutoPadding &) = delete;
     AccessWindowAutoPadding(AccessWindowAutoPadding &&)                 = default;
@@ -70,7 +70,7 @@
     ValidRegion compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const override;
 
 private:
-    TensorInfo *_info;
+    ITensorInfo *_info;
 };
 } // namespace arm_compute
 #endif /*__ARM_COMPUTE_ACCESS_WINDOW_AUTO_PADDING_H__*/
diff --git a/arm_compute/core/AccessWindowStatic.h b/arm_compute/core/AccessWindowStatic.h
index 9c26998..6dcba07 100644
--- a/arm_compute/core/AccessWindowStatic.h
+++ b/arm_compute/core/AccessWindowStatic.h
@@ -34,7 +34,7 @@
 namespace arm_compute
 {
 class Window;
-class TensorInfo;
+class ITensorInfo;
 
 /** Implementation of a static rectangular access pattern.
  *
@@ -54,7 +54,7 @@
      * @param[in]     end_x   End of the access in X direction.
      * @param[in]     end_y   End of the access in Y direction.
      */
-    AccessWindowStatic(TensorInfo *info, int start_x, int start_y, int end_x, int end_y);
+    AccessWindowStatic(ITensorInfo *info, int start_x, int start_y, int end_x, int end_y);
 
     AccessWindowStatic(const AccessWindowStatic &) = delete;
     AccessWindowStatic &operator=(const AccessWindowStatic &) = delete;
@@ -82,11 +82,11 @@
     bool update_padding_if_needed(const Window &window) const override;
     ValidRegion compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const override;
 
-    TensorInfo *_info;
-    int         _start_x;
-    int         _start_y;
-    int         _end_x;
-    int         _end_y;
+    ITensorInfo *_info;
+    int          _start_x;
+    int          _start_y;
+    int          _end_x;
+    int          _end_y;
 };
 } // namespace arm_compute
 #endif /*__ARM_COMPUTE_IACCESS_WINDOW_STATIC_H__*/
diff --git a/arm_compute/core/AccessWindowTranspose.h b/arm_compute/core/AccessWindowTranspose.h
index 4276503..102860f 100644
--- a/arm_compute/core/AccessWindowTranspose.h
+++ b/arm_compute/core/AccessWindowTranspose.h
@@ -32,7 +32,7 @@
 namespace arm_compute
 {
 class Window;
-class TensorInfo;
+class ITensorInfo;
 
 /** Implementation of a XY-transpose access pattern. */
 class AccessWindowTranspose : public AccessWindowRectangle
diff --git a/arm_compute/core/CL/CLHelpers.h b/arm_compute/core/CL/CLHelpers.h
index 230685c..26253e3 100644
--- a/arm_compute/core/CL/CLHelpers.h
+++ b/arm_compute/core/CL/CLHelpers.h
@@ -24,11 +24,22 @@
 #ifndef __ARM_COMPUTE_CLHELPERS_H__
 #define __ARM_COMPUTE_CLHELPERS_H__
 
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Helpers.h"
+
 #include <string>
 
 namespace arm_compute
 {
 enum class DataType;
+enum class GPUTarget;
+
+/** Enable bitwise operations on GPUTarget enumerations */
+template <>
+struct enable_bitwise_ops<arm_compute::GPUTarget>
+{
+    static constexpr bool value = true;
+};
 
 /** Max vector width of an OpenCL vector */
 static constexpr const unsigned int max_cl_vector_width = 16;
@@ -40,5 +51,55 @@
  * @return The string specifying the OpenCL type to be used.
  */
 std::string get_cl_type_from_data_type(const DataType &dt);
+
+/** Translates a given GPU device target to a string.
+ *
+ * @param[in] target Given GPU target.
+ *
+ * @return The string describing the target.
+ */
+const std::string &string_from_target(GPUTarget target);
+
+/** Helper function to create and return a unique_ptr pointing to a CL kernel object.
+ *  It also calls the kernel's configure() method.
+ *
+ * @param[in] args All the arguments that need to be passed to the kernel's configure() method.
+ *
+ * @return A unique pointer pointing to a CL kernel object
+ */
+template <typename Kernel, typename... T>
+std::unique_ptr<Kernel> create_configure_kernel(T &&... args)
+{
+    std::unique_ptr<Kernel> k = arm_compute::cpp14::make_unique<Kernel>();
+    k->configure(std::forward<T>(args)...);
+    return k;
+}
+
+/** Helper function to create and return a unique_ptr pointing to a CL kernel object
+ *
+ * @return A unique pointer pointing to a CL kernel object
+ */
+template <typename Kernel>
+std::unique_ptr<Kernel> create_kernel()
+{
+    std::unique_ptr<Kernel> k = arm_compute::cpp14::make_unique<Kernel>();
+    return k;
+}
+
+/** Helper function to get the GPU target from a CL device
+ *
+ * @param[in] device A CL device
+ *
+ * @return The GPU target
+ */
+GPUTarget get_target_from_device(cl::Device &device);
+
+/** Helper function to get the GPU architecture from a GPU target
+ *
+ * @param[in] target GPU target
+ *
+ * @return The GPU architecture of the given target
+ */
+GPUTarget get_arch_from_target(GPUTarget target);
 }
 #endif
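
A minimal usage sketch for the new kernel-creation helpers declared above. ExampleKernel is a hypothetical kernel type with a matching configure() method (not part of the library), and the helper is restated inline so the snippet is self-contained; the library version uses arm_compute::cpp14::make_unique instead of std::make_unique.

    // Sketch only: mirrors create_configure_kernel()/create_kernel() from CLHelpers.h.
    #include <memory>
    #include <utility>

    struct ExampleKernel
    {
        void configure(int width, int height)
        {
            // A real kernel would store tensor pointers, build the OpenCL kernel and compute the window here.
            (void)width;
            (void)height;
        }
    };

    template <typename Kernel, typename... T>
    std::unique_ptr<Kernel> create_configure_kernel(T &&... args)
    {
        auto k = std::make_unique<Kernel>();
        k->configure(std::forward<T>(args)...); // allocate + configure in one call
        return k;
    }

    int main()
    {
        auto k = create_configure_kernel<ExampleKernel>(640, 480);
        return k != nullptr ? 0 : 1;
    }
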
diff --git a/arm_compute/core/CL/CLKernels.h b/arm_compute/core/CL/CLKernels.h
index 70789b2..0e9f356 100644
--- a/arm_compute/core/CL/CLKernels.h
+++ b/arm_compute/core/CL/CLKernels.h
@@ -30,6 +30,7 @@
 #include "arm_compute/core/CL/kernels/CLActivationLayerKernel.h"
 #include "arm_compute/core/CL/kernels/CLArithmeticAdditionKernel.h"
 #include "arm_compute/core/CL/kernels/CLArithmeticSubtractionKernel.h"
+#include "arm_compute/core/CL/kernels/CLBatchNormalizationLayerKernel.h"
 #include "arm_compute/core/CL/kernels/CLBitwiseAndKernel.h"
 #include "arm_compute/core/CL/kernels/CLBitwiseNotKernel.h"
 #include "arm_compute/core/CL/kernels/CLBitwiseOrKernel.h"
@@ -41,7 +42,7 @@
 #include "arm_compute/core/CL/kernels/CLCol2ImKernel.h"
 #include "arm_compute/core/CL/kernels/CLColorConvertKernel.h"
 #include "arm_compute/core/CL/kernels/CLConvolutionKernel.h"
-#include "arm_compute/core/CL/kernels/CLConvolutionLayerWeightsReshapeKernel.h"
+#include "arm_compute/core/CL/kernels/CLDepthConcatenateKernel.h"
 #include "arm_compute/core/CL/kernels/CLDepthConvertKernel.h"
 #include "arm_compute/core/CL/kernels/CLDerivativeKernel.h"
 #include "arm_compute/core/CL/kernels/CLDilateKernel.h"
@@ -62,6 +63,7 @@
 #include "arm_compute/core/CL/kernels/CLIm2ColKernel.h"
 #include "arm_compute/core/CL/kernels/CLIntegralImageKernel.h"
 #include "arm_compute/core/CL/kernels/CLLKTrackerKernel.h"
+#include "arm_compute/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.h"
 #include "arm_compute/core/CL/kernels/CLMagnitudePhaseKernel.h"
 #include "arm_compute/core/CL/kernels/CLMeanStdDevKernel.h"
 #include "arm_compute/core/CL/kernels/CLMedian3x3Kernel.h"
@@ -83,5 +85,6 @@
 #include "arm_compute/core/CL/kernels/CLTransposeKernel.h"
 #include "arm_compute/core/CL/kernels/CLWarpAffineKernel.h"
 #include "arm_compute/core/CL/kernels/CLWarpPerspectiveKernel.h"
+#include "arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h"
 
 #endif /* __ARM_COMPUTE_CLKERNELS_H__ */
diff --git a/arm_compute/core/CL/CLTypes.h b/arm_compute/core/CL/CLTypes.h
new file mode 100644
index 0000000..c5643d8
--- /dev/null
+++ b/arm_compute/core/CL/CLTypes.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CL_TYPES_H__
+#define __ARM_COMPUTE_CL_TYPES_H__
+
+namespace arm_compute
+{
+/** Available GPU Targets */
+enum class GPUTarget
+{
+    GPU_ARCH_MASK = 0xF00,
+    MIDGARD       = 0x100,
+    BIFROST       = 0x200,
+    T600          = 0x110,
+    T700          = 0x120,
+    T800          = 0x130,
+    G70           = 0x210
+};
+}
+#endif /* __ARM_COMPUTE_CL_TYPES_H__ */
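
The enum values above encode the architecture in the high nibble, which is what the new bitwise operations and get_arch_from_target() rely on. A standalone sketch of the masking arithmetic (illustration only; the library exposes this through get_arch_from_target()):

    // Masking a target with GPU_ARCH_MASK recovers its architecture:
    // T700 (0x120) & 0xF00 == 0x100 (MIDGARD), G70 (0x210) & 0xF00 == 0x200 (BIFROST).
    #include <cassert>
    #include <cstdint>

    enum class GPUTarget : uint32_t
    {
        GPU_ARCH_MASK = 0xF00,
        MIDGARD       = 0x100,
        BIFROST       = 0x200,
        T600          = 0x110,
        T700          = 0x120,
        T800          = 0x130,
        G70           = 0x210
    };

    constexpr uint32_t arch_of(GPUTarget target)
    {
        return static_cast<uint32_t>(target) & static_cast<uint32_t>(GPUTarget::GPU_ARCH_MASK);
    }

    int main()
    {
        assert(arch_of(GPUTarget::T700) == static_cast<uint32_t>(GPUTarget::MIDGARD));
        assert(arch_of(GPUTarget::G70) == static_cast<uint32_t>(GPUTarget::BIFROST));
    }
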
diff --git a/arm_compute/core/CL/ICLHOG.h b/arm_compute/core/CL/ICLHOG.h
new file mode 100644
index 0000000..a3d2fb4
--- /dev/null
+++ b/arm_compute/core/CL/ICLHOG.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_ICLHOG_H__
+#define __ARM_COMPUTE_ICLHOG_H__
+
+#include "arm_compute/core/IHOG.h"
+
+#include <cstdint>
+
+namespace cl
+{
+class Buffer;
+class CommandQueue;
+}
+
+namespace arm_compute
+{
+/** Interface for OpenCL HOG data-object */
+class ICLHOG : public IHOG
+{
+public:
+    /** Default constructor */
+    ICLHOG();
+    /** Prevent instances of this class from being copied (As this class contains pointers). */
+    ICLHOG(const ICLHOG &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers). */
+    ICLHOG &operator=(const ICLHOG &) = delete;
+    /** Allow instances of this class to be moved */
+    ICLHOG(ICLHOG &&) = default;
+    /** Allow instances of this class to be moved */
+    ICLHOG &operator=(ICLHOG &&) = default;
+    /** Default destructor */
+    virtual ~ICLHOG() = default;
+
+    /** Interface to be implemented by the child class to return a reference to the OpenCL buffer containing the hog's descriptor
+     *
+     * @return A reference to an OpenCL buffer containing the hog's descriptor
+     */
+    virtual const cl::Buffer &cl_buffer() const = 0;
+
+    /** Enqueue a map operation of the allocated buffer on the given queue.
+     *
+     * @param[in,out] q        The CL command queue to use for the mapping operation.
+     * @param[in]     blocking If true, then the mapping will be ready to use by the time
+     *                         this method returns, else it is the caller's responsibility
+     *                         to flush the queue and wait for the mapping operation to have completed before accessing the mapped descriptor.
+     */
+    void map(cl::CommandQueue &q, bool blocking = true);
+
+    /** Enqueue an unmap operation of the allocated and mapped buffer on the given queue.
+     *
+     * @note This method simply enqueues the unmap operation, it is the caller's responsibility to flush the queue and make sure the unmap is finished before
+     *       the memory is accessed by the device.
+     *
+     * @param[in,out] q The CL command queue to use for the mapping operation.
+     */
+    void unmap(cl::CommandQueue &q);
+
+    /** Interface to be implemented by the child class to free the allocated cl buffer.
+     *
+     * @warning The buffer must have been allocated previously. Otherwise calling the function will fail.
+     */
+    virtual void free() = 0;
+
+    // Inherited methods overridden:
+    float *descriptor() const override;
+
+protected:
+    /** Method to be implemented by the child class to map the OpenCL buffer
+     *
+     * @param[in,out] q        The CL command queue to use for the mapping operation.
+     * @param[in]     blocking If true, then the mapping will be ready to use by the time
+     *                         this method returns, else it is the caller's responsibility
+     *                         to flush the queue and wait for the mapping operation to have completed before using the returned mapping pointer.
+     */
+    virtual uint8_t *do_map(cl::CommandQueue &q, bool blocking) = 0;
+    /** Method to be implemented by the child class to unmap the OpenCL buffer
+     *
+     * @note This method simply enqueues the unmap operation, it is the caller's responsibility to flush the queue and make sure the unmap is finished before
+     *       the memory is accessed by the device.
+     *
+     * @param[in,out] q The CL command queue to use for the mapping operation.
+     */
+    virtual void do_unmap(cl::CommandQueue &q) = 0;
+
+private:
+    uint8_t *_mapping;
+};
+}
+#endif /*__ARM_COMPUTE_ICLHOG_H__ */
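
The map()/unmap() pair follows the usual OpenCL mapping protocol described in the comments above. A usage sketch, assuming hog is some concrete ICLHOG implementation and queue a valid command queue (function name and the particular host-side access are illustrative only):

    // Blocking map, host-side access through descriptor(), then unmap.
    #include "arm_compute/core/CL/ICLHOG.h"
    #include "arm_compute/core/CL/OpenCL.h"

    void scale_first_bin(arm_compute::ICLHOG &hog, cl::CommandQueue &queue)
    {
        hog.map(queue, true);           // blocking: descriptor() is valid once this returns
        float *desc = hog.descriptor(); // host-visible pointer to the HOG descriptor
        if(desc != nullptr)
        {
            desc[0] *= 0.5f;            // example host-side modification
        }
        hog.unmap(queue);               // enqueue unmap; flush the queue before device use
    }
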
diff --git a/arm_compute/core/CL/ICLKernel.h b/arm_compute/core/CL/ICLKernel.h
index f2cbb2b..72c963d 100644
--- a/arm_compute/core/CL/ICLKernel.h
+++ b/arm_compute/core/CL/ICLKernel.h
@@ -24,6 +24,7 @@
 #ifndef __ARM_COMPUTE_ICLKERNEL_H__
 #define __ARM_COMPUTE_ICLKERNEL_H__
 
+#include "arm_compute/core/CL/CLTypes.h"
 #include "arm_compute/core/CL/OpenCL.h"
 #include "arm_compute/core/IKernel.h"
 
@@ -98,6 +99,24 @@
         _kernel.setArg(idx++, value);
     }
 
+    /** Set the targeted GPU architecture
+     *
+     * @param[in] target The targeted GPU architecture
+     */
+    void set_target(GPUTarget target);
+
+    /** Set the targeted GPU architecture according to the CL device
+     *
+     * @param[in] device A CL device
+     */
+    void set_target(cl::Device &device);
+
+    /** Get the targeted GPU architecture
+     *
+     * @return The targeted GPU architecture.
+     */
+    GPUTarget get_target() const;
+
 private:
     /** Add the passed tensor's parameters to the object's kernel's arguments starting from the index idx.
      *
@@ -117,6 +136,7 @@
 protected:
     cl::Kernel  _kernel;   /**< OpenCL kernel to run */
     cl::NDRange _lws_hint; /**< Local workgroup size hint for the OpenCL kernel */
+    GPUTarget   _target;   /**< The targeted GPU */
 };
 
 /** Add the kernel to the command queue with the given window.
diff --git a/arm_compute/core/CL/ICLMultiHOG.h b/arm_compute/core/CL/ICLMultiHOG.h
new file mode 100644
index 0000000..9f3c775
--- /dev/null
+++ b/arm_compute/core/CL/ICLMultiHOG.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_ICLMULTIHOG_H__
+#define __ARM_COMPUTE_ICLMULTIHOG_H__
+
+#include "arm_compute/core/CL/ICLHOG.h"
+#include "arm_compute/core/IMultiHOG.h"
+
+namespace arm_compute
+{
+/** Interface for storing multiple HOG data-objects */
+class ICLMultiHOG : public IMultiHOG
+{
+public:
+    /** Return a pointer to the requested OpenCL HOG model
+     *
+     *  @param[in] index The index of the wanted OpenCL HOG model.
+     *
+     *  @return A pointer to the HOG model
+     */
+    virtual ICLHOG *cl_model(size_t index) = 0;
+    /** Return a constant pointer to the requested OpenCL HOG model
+     *
+     *  @param[in] index The index of the wanted OpenCL HOG model.
+     *
+     *  @return A constant pointer to the OpenCL HOG model
+     */
+    virtual const ICLHOG *cl_model(size_t index) const = 0;
+
+    // Inherited methods overridden:
+    IHOG *model(size_t index) override;
+    const IHOG *model(size_t index) const override;
+};
+}
+#endif /*__ARM_COMPUTE_ICLMULTIHOG_H__ */
diff --git a/arm_compute/core/CL/kernels/CLActivationLayerKernel.h b/arm_compute/core/CL/kernels/CLActivationLayerKernel.h
index 887d31f..490e705 100644
--- a/arm_compute/core/CL/kernels/CLActivationLayerKernel.h
+++ b/arm_compute/core/CL/kernels/CLActivationLayerKernel.h
@@ -24,14 +24,14 @@
 #ifndef __ARM_COMPUTE_CLACTIVATIONLAYERKERNEL_H__
 #define __ARM_COMPUTE_CLACTIVATIONLAYERKERNEL_H__
 
-#include "arm_compute/core/CL/ICLSimple2DKernel.h"
+#include "arm_compute/core/CL/ICLSimple3DKernel.h"
 
 namespace arm_compute
 {
 class ICLTensor;
 
 /** Interface for the activation layer kernel. */
-class CLActivationLayerKernel : public ICLSimple2DKernel
+class CLActivationLayerKernel : public ICLSimple3DKernel
 {
 public:
     /** Set the input and output tensor.
diff --git a/arm_compute/core/CL/kernels/CLBatchNormalizationLayerKernel.h b/arm_compute/core/CL/kernels/CLBatchNormalizationLayerKernel.h
new file mode 100644
index 0000000..0888538
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLBatchNormalizationLayerKernel.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLBATCHNORMALIZATIONLAYERKERNEL_H__
+#define __ARM_COMPUTE_CLBATCHNORMALIZATIONLAYERKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the BatchNormalization layer kernel.
+ */
+class CLBatchNormalizationLayerKernel : public ICLKernel
+{
+public:
+    /** Constructor */
+    CLBatchNormalizationLayerKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLBatchNormalizationLayerKernel(const CLBatchNormalizationLayerKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLBatchNormalizationLayerKernel &operator=(const CLBatchNormalizationLayerKernel &) = delete;
+    /** Default Move Constructor. */
+    CLBatchNormalizationLayerKernel(CLBatchNormalizationLayerKernel &&) = default;
+    /** Default move assignment operator. */
+    CLBatchNormalizationLayerKernel &operator=(CLBatchNormalizationLayerKernel &&) = default;
+    /** Default destructor */
+    ~CLBatchNormalizationLayerKernel() = default;
+
+    /** Set the input and output tensors.
+     *
+     * @param[in]  input   Source tensor. 3 lower dimensions represent a single input with dimensions [width, height, FM].
+     *                     The rest are optional and used for representing batches. Data types supported: F32.
+     * @param[out] output  Destination tensor. Output will have the same number of dimensions as input. Data type supported: same as @p input
+     * @param[in]  mean    Mean values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+     * @param[in]  var     Variance values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+     * @param[in]  beta    Beta values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+     * @param[in]  gamma   Gamma values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+     * @param[in]  epsilon Small value to avoid division by zero.
+     */
+    void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, const ICLTensor *gamma, float epsilon);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor *_input;
+    ICLTensor       *_output;
+    const ICLTensor *_mean;
+    const ICLTensor *_var;
+    const ICLTensor *_beta;
+    const ICLTensor *_gamma;
+    float            _epsilon;
+};
+}
+#endif /*__ARM_COMPUTE_CLBATCHNORMALIZATIONLAYERKERNEL_H__ */
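
For reference, the parameters accepted by configure() correspond to the standard batch normalization transform applied per feature map. A scalar reference sketch (not the kernel's OpenCL code):

    // out = gamma * (in - mean) / sqrt(var + epsilon) + beta, applied element-wise,
    // with mean/var/beta/gamma indexed by feature map.
    #include <cmath>

    inline float batch_normalize(float in, float mean, float var, float beta, float gamma, float epsilon)
    {
        return gamma * (in - mean) / std::sqrt(var + epsilon) + beta;
    }
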
diff --git a/arm_compute/core/CL/kernels/CLDepthConcatenateKernel.h b/arm_compute/core/CL/kernels/CLDepthConcatenateKernel.h
new file mode 100644
index 0000000..eda4c66
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLDepthConcatenateKernel.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __ARM_COMPUTE_CLDEPTHCONCATENATEKERNEL_H__
+#define __ARM_COMPUTE_CLDEPTHCONCATENATEKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the depth concatenate kernel.
+ *  The input tensor will be concatenated into the output tensor.
+ */
+class CLDepthConcatenateKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLDepthConcatenateKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLDepthConcatenateKernel(const CLDepthConcatenateKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLDepthConcatenateKernel &operator=(const CLDepthConcatenateKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLDepthConcatenateKernel(CLDepthConcatenateKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLDepthConcatenateKernel &operator=(CLDepthConcatenateKernel &&) = default;
+    /** Default destructor */
+    ~CLDepthConcatenateKernel() = default;
+    /** Initialise the kernel's inputs and output
+     *
+     * @param[in]     input        Input tensor. Data types supported: F32.
+     * @param[in]     depth_offset The offset on the Z axis.
+     * @param[in,out] output       Output tensor. Data types supported: F32.
+     *
+     * @note The output tensor's two lowest dimensions can't be smaller than the input tensor's.
+     * @note The gaps between the two lowest dimensions of input and output need to be divisible by 2.
+     *
+     */
+    void configure(const ICLTensor *input, unsigned int depth_offset, ICLTensor *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+    BorderSize border_size() const override;
+
+private:
+    const ICLTensor *_input;
+    ICLTensor       *_output;
+    int              _top_bottom;
+    int              _left_right;
+};
+}
+#endif /* __ARM_COMPUTE_CLDEPTHCONCATENATEKERNEL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h b/arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h
index f70a0ae..8d44a4c 100644
--- a/arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h
@@ -76,6 +76,9 @@
      * @param[out] output Output tensor. Data type supported: same as @p input
      */
     void configure(const ICLTensor *input, ICLTensor *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
 };
 }
 #endif /* __ARM_COMPUTE_CLGEMMTRANSPOSE1XWKERNEL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLHOGDescriptorKernel.h b/arm_compute/core/CL/kernels/CLHOGDescriptorKernel.h
new file mode 100644
index 0000000..45a5aac
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLHOGDescriptorKernel.h
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLHOGDESCRIPTORKERNEL_H__
+#define __ARM_COMPUTE_CLHOGDESCRIPTORKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/IHOG.h"
+#include "arm_compute/core/Size2D.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** OpenCL kernel to perform HOG Orientation Binning */
+class CLHOGOrientationBinningKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLHOGOrientationBinningKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLHOGOrientationBinningKernel(const CLHOGOrientationBinningKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLHOGOrientationBinningKernel &operator=(const CLHOGOrientationBinningKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLHOGOrientationBinningKernel(CLHOGOrientationBinningKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLHOGOrientationBinningKernel &operator=(CLHOGOrientationBinningKernel &&) = default;
+    /** Default destructor */
+    ~CLHOGOrientationBinningKernel() = default;
+
+    /**  Initialise the kernel's inputs, output and HOG's metadata
+     *
+     * @param[in]  input_magnitude Input tensor which stores the magnitude of the gradient for each pixel. Data type supported: S16.
+     * @param[in]  input_phase     Input tensor which stores the phase of the gradient for each pixel. Data type supported: U8
+     * @param[out] output          Output tensor which stores the local HOG for each cell. DataType supported: F32. Number of channels supported: equal to the number of histogram bins per cell
+     * @param[in]  hog_info        HOG's metadata
+     */
+    void configure(const ICLTensor *input_magnitude, const ICLTensor *input_phase, ICLTensor *output, const HOGInfo *hog_info);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor *_input_magnitude;
+    const ICLTensor *_input_phase;
+    ICLTensor       *_output;
+    Size2D           _cell_size;
+};
+
+/** OpenCL kernel to perform HOG block normalization */
+class CLHOGBlockNormalizationKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLHOGBlockNormalizationKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLHOGBlockNormalizationKernel(const CLHOGBlockNormalizationKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLHOGBlockNormalizationKernel &operator=(const CLHOGBlockNormalizationKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLHOGBlockNormalizationKernel(CLHOGBlockNormalizationKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLHOGBlockNormalizationKernel &operator=(CLHOGBlockNormalizationKernel &&) = default;
+    /** Default destructor */
+    ~CLHOGBlockNormalizationKernel() = default;
+
+    /** Initialise the kernel's input, output and HOG's metadata
+     *
+     * @param[in]  input    Input tensor which stores the local HOG for each cell. Data type supported: F32. Number of channels supported: equal to the number of histogram bins per cell
+     * @param[out] output   Output tensor which stores the normalised blocks. Data type supported: F32. Number of channels supported: equal to the number of histogram bins per block
+     * @param[in]  hog_info HOG's metadata
+     */
+    void configure(const ICLTensor *input, ICLTensor *output, const HOGInfo *hog_info);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor *_input;
+    ICLTensor       *_output;
+    Size2D           _num_cells_per_block_stride;
+};
+}
+#endif /* __ARM_COMPUTE_CLHOGDESCRIPTORKERNEL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLHOGDetectorKernel.h b/arm_compute/core/CL/kernels/CLHOGDetectorKernel.h
new file mode 100644
index 0000000..47bd054
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLHOGDetectorKernel.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLHOGDETECTORKERNEL_H__
+#define __ARM_COMPUTE_CLHOGDETECTORKERNEL_H__
+
+#include "arm_compute/core/CL/ICLArray.h"
+#include "arm_compute/core/CL/ICLHOG.h"
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/CL/OpenCL.h"
+
+namespace cl
+{
+class Buffer;
+}
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to perform HOG detector kernel using linear SVM */
+class CLHOGDetectorKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLHOGDetectorKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLHOGDetectorKernel(const CLHOGDetectorKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLHOGDetectorKernel &operator=(const CLHOGDetectorKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLHOGDetectorKernel(CLHOGDetectorKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLHOGDetectorKernel &operator=(CLHOGDetectorKernel &&) = default;
+    /** Default destructor */
+    ~CLHOGDetectorKernel() = default;
+
+    /** Initialise the kernel's input, HOG data-object, detection window, the stride of the detection window, the threshold and index of the object to detect
+     *
+     * @param[in]  input                   Input tensor which stores the HOG descriptor obtained with @ref CLHOGOrientationBinningKernel. Data type supported: F32. Number of channels supported: equal to the number of histogram bins per block
+     * @param[in]  hog                     HOG data object used by @ref CLHOGOrientationBinningKernel and  @ref CLHOGBlockNormalizationKernel
+     * @param[out] detection_windows       Array of @ref DetectionWindow. This array stores all the detected objects
+     * @param[in]  num_detection_windows   Number of detected objects
+     * @param[in]  detection_window_stride Distance in pixels between 2 consecutive detection windows in x and y directions.
+     *                                     It must be multiple of the hog->info()->block_stride()
+     * @param[in]  threshold               (Optional) Threshold for the distance between features and SVM classifying plane
+     * @param[in]  idx_class               (Optional) Index of the class used for evaluating which class the detection window belongs to
+     */
+    void configure(const ICLTensor *input, const ICLHOG *hog, ICLDetectionWindowArray *detection_windows, cl::Buffer *num_detection_windows, const Size2D &detection_window_stride, float threshold = 0.0f,
+                   uint16_t idx_class = 0);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor         *_input;
+    ICLDetectionWindowArray *_detection_windows;
+    cl::Buffer              *_num_detection_windows;
+};
+}
+
+#endif /* __ARM_COMPUTE_CLHOGDETECTORKERNEL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.h b/arm_compute/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.h
new file mode 100644
index 0000000..fda0327
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLLOCALLYCONNECTEDMATRIXMULTIPLYKERNEL_H__
+#define __ARM_COMPUTE_CLLOCALLYCONNECTEDMATRIXMULTIPLYKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to multiply each row of the first tensor with the two lowest dimensions of the second tensor.
+ *
+ * @attention The second input tensor must have at least 2 dimensions (matrix)
+ *
+ */
+class CLLocallyConnectedMatrixMultiplyKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLLocallyConnectedMatrixMultiplyKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLLocallyConnectedMatrixMultiplyKernel(const CLLocallyConnectedMatrixMultiplyKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLLocallyConnectedMatrixMultiplyKernel &operator=(const CLLocallyConnectedMatrixMultiplyKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLLocallyConnectedMatrixMultiplyKernel(CLLocallyConnectedMatrixMultiplyKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLLocallyConnectedMatrixMultiplyKernel &operator=(CLLocallyConnectedMatrixMultiplyKernel &&) = default;
+    /** Initialise the kernel's inputs and output
+     *
+     * @param[in]  input0 First input tensor. Data types supported: F32
+     * @param[in]  input1 Second input tensor. Data type supported: same as @p input0
+     * @param[out] output Output tensor to store the result. Data type supported: same as @p input0
+     */
+    void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor *_input0;
+    const ICLTensor *_input1;
+    ICLTensor       *_output;
+};
+}
+#endif /* __ARM_COMPUTE_CLLOCALLYCONNECTEDMATRIXMULTIPLYKERNEL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h b/arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h
index dd96aae..6fbbe95 100644
--- a/arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h
+++ b/arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef __ARM_COMPUTE_CLDEPTHCONVERTKERNEL_H__
-#define __ARM_COMPUTE_CLDEPTHCONVERTKERNEL_H__
+#ifndef __ARM_COMPUTE_CLPIXELWISEMULTIPLICATIONKERNEL_H__
+#define __ARM_COMPUTE_CLPIXELWISEMULTIPLICATIONKERNEL_H__
 
 #include "arm_compute/core/CL/ICLKernel.h"
 #include "arm_compute/core/Types.h"
@@ -70,4 +70,4 @@
 };
 }
 
-#endif /*__ARM_COMPUTE_CLDEPTHCONVERTKERNEL_H__ */
+#endif /*__ARM_COMPUTE_CLPIXELWISEMULTIPLICATIONKERNEL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLConvolutionLayerWeightsReshapeKernel.h b/arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h
similarity index 64%
rename from arm_compute/core/CL/kernels/CLConvolutionLayerWeightsReshapeKernel.h
rename to arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h
index 0ad0c0d..1dc8a8b 100644
--- a/arm_compute/core/CL/kernels/CLConvolutionLayerWeightsReshapeKernel.h
+++ b/arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h
@@ -21,13 +21,52 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef __ARM_COMPUTE_CLCONVOLUTIONLAYERWEIGHTSRESHAPEKERNEL_H__
-#define __ARM_COMPUTE_CLCONVOLUTIONLAYERWEIGHTSRESHAPEKERNEL_H__
+#ifndef __ARM_COMPUTE_CLWEIGHTSRESHAPEKERNEL_H__
+#define __ARM_COMPUTE_CLWEIGHTSRESHAPEKERNEL_H__
 
 #include "arm_compute/core/CL/ICLKernel.h"
 
 namespace arm_compute
 {
+class CLWeightsReshapeKernel : public ICLKernel
+{
+public:
+    /** Constructor.
+     *
+     * @param[in] is_shared Flag to indicate whether the weights are shared or not.
+     */
+    CLWeightsReshapeKernel(bool is_shared = false);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLWeightsReshapeKernel(const CLWeightsReshapeKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLWeightsReshapeKernel &operator=(const CLWeightsReshapeKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLWeightsReshapeKernel(CLWeightsReshapeKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLWeightsReshapeKernel &operator=(CLWeightsReshapeKernel &&) = default;
+    /** Default destructor */
+    ~CLWeightsReshapeKernel() = default;
+
+    /** Set the input and output of the kernel.
+     *
+     * @param[in]  input  The input tensor to convert. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared,
+     *                    and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM,  num_patches] if unshared. Data types supported: F16, F32
+     * @param[in]  biases The shared biases tensor to append.  Bias is 1D tensor with dimensions [OFM] if shared and 2D tensor with
+     *                    dimensions [OFM, num_patches] if unshared. Data types supported: Same as @p input
+     * @param[out] output The output tensor. Should be a 2D Tensor. Data types supported: Same as @p input
+     */
+    void configure(const ICLTensor *input, const ICLTensor *biases, ICLTensor *output);
+
+    // Inherited methods overridden:
+    virtual void run(const Window &window, cl::CommandQueue &queue) = 0;
+
+protected:
+    bool             _is_shared;
+    const ICLTensor *_input;
+    const ICLTensor *_biases;
+    ICLTensor       *_output;
+};
+
 /** Interface for the weights reshape kernel used by convolution and fully connected layers.
  *
  * Rearranges each 3-dimensional kernel to a single row leading to a matrix with linearized kernels.
@@ -51,37 +90,25 @@
  * \end{array} \right)
  * @f]
  */
-class CLConvolutionLayerWeightsReshapeKernel : public ICLKernel
+class CLConvolutionLayerWeightsReshapeKernel : public CLWeightsReshapeKernel
 {
 public:
     /** Default constructor */
     CLConvolutionLayerWeightsReshapeKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLConvolutionLayerWeightsReshapeKernel(const CLConvolutionLayerWeightsReshapeKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLConvolutionLayerWeightsReshapeKernel &operator=(const CLConvolutionLayerWeightsReshapeKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLConvolutionLayerWeightsReshapeKernel(CLConvolutionLayerWeightsReshapeKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLConvolutionLayerWeightsReshapeKernel &operator=(CLConvolutionLayerWeightsReshapeKernel &&) = default;
-    /** Default destructor */
-    ~CLConvolutionLayerWeightsReshapeKernel() = default;
-
-    /** Set the input and output of the kernel.
-     *
-     * @param[in]  input  The input tensor to convert. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].  Data types supported: F16, F32
-     * @param[in]  biases The shared biases tensor to append.  Bias is 1D tensor with dimensions [OFM]. Data types supported: Same as @p input
-     * @param[out] output The output tensor. Should be a 2D Tensor. Data types supported: Same as @p input
-     */
-    void configure(const ICLTensor *input, const ICLTensor *biases, ICLTensor *output);
 
     // Inherited methods overridden:
     void run(const Window &window, cl::CommandQueue &queue) override;
+};
 
-private:
-    const ICLTensor *_input;
-    const ICLTensor *_biases;
-    ICLTensor       *_output;
+/** Interface for the weights reshape kernel used by locally connected layers. */
+class CLLocallyConnectedLayerWeightsReshapeKernel : public CLWeightsReshapeKernel
+{
+public:
+    /** Default constructor */
+    CLLocallyConnectedLayerWeightsReshapeKernel();
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
 };
 }
-#endif /*__ARM_COMPUTE_CLCONVOLUTIONLAYERWEIGHTSRESHAPEKERNEL_H__ */
+#endif /*__ARM_COMPUTE_CLWEIGHTSRESHAPEKERNEL_H__ */
diff --git a/arm_compute/core/CPP/CPPKernels.h b/arm_compute/core/CPP/CPPKernels.h
index 213a9e6..1eabfa9 100644
--- a/arm_compute/core/CPP/CPPKernels.h
+++ b/arm_compute/core/CPP/CPPKernels.h
@@ -26,6 +26,7 @@
 
 /* Header regrouping all the CPP kernels */
 #include "arm_compute/core/CPP/kernels/CPPCornerCandidatesKernel.h"
+#include "arm_compute/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.h"
 #include "arm_compute/core/CPP/kernels/CPPSortEuclideanDistanceKernel.h"
 
 #endif /* __ARM_COMPUTE_CPPKERNELS_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEHOGNonMaximaSuppressionKernel.h b/arm_compute/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.h
similarity index 61%
rename from arm_compute/core/NEON/kernels/NEHOGNonMaximaSuppressionKernel.h
rename to arm_compute/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.h
index c602f06..bcb3026 100644
--- a/arm_compute/core/NEON/kernels/NEHOGNonMaximaSuppressionKernel.h
+++ b/arm_compute/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2017 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef __ARM_COMPUTE_NEHOGNONMAXIMASUPPRESSIONKERNEL_H__
-#define __ARM_COMPUTE_NEHOGNONMAXIMASUPPRESSIONKERNEL_H__
+#ifndef __ARM_COMPUTE_CPPDETECTIONWINDOWNONMAXIMASUPPRESSIONKERNEL_H__
+#define __ARM_COMPUTE_CPPDETECTIONWINDOWNONMAXIMASUPPRESSIONKERNEL_H__
 
 #include "arm_compute/core/IArray.h"
 #include "arm_compute/core/IHOG.h"
@@ -31,26 +31,29 @@
 
 namespace arm_compute
 {
-/** NEON kernel to perform in-place computation of euclidean distance based non-maxima suppression for HOG
+/** CPP kernel to perform in-place euclidean distance based non-maxima suppression on an IDetectionWindowArray
  *
- * @note This kernel is meant to be used alongside HOG and performs a non-maxima suppression on a
- *       HOG detection window.
+ * @note This kernel is meant to be used alongside HOG or other object detection algorithms to perform a non-maxima suppression on an
+ *       IDetectionWindowArray
  */
-class NEHOGNonMaximaSuppressionKernel : public INEKernel
+class CPPDetectionWindowNonMaximaSuppressionKernel : public ICPPKernel
 {
 public:
     /** Default constructor */
-    NEHOGNonMaximaSuppressionKernel();
+    CPPDetectionWindowNonMaximaSuppressionKernel();
     /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEHOGNonMaximaSuppressionKernel(const NEHOGNonMaximaSuppressionKernel &) = delete;
+    CPPDetectionWindowNonMaximaSuppressionKernel(const CPPDetectionWindowNonMaximaSuppressionKernel &) = delete;
     /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEHOGNonMaximaSuppressionKernel &operator=(const NEHOGNonMaximaSuppressionKernel &) = delete;
+    CPPDetectionWindowNonMaximaSuppressionKernel &operator=(const CPPDetectionWindowNonMaximaSuppressionKernel &) = delete;
     /** Allow instances of this class to be moved */
-    NEHOGNonMaximaSuppressionKernel(NEHOGNonMaximaSuppressionKernel &&) = default;
+    CPPDetectionWindowNonMaximaSuppressionKernel(CPPDetectionWindowNonMaximaSuppressionKernel &&) = default;
     /** Allow instances of this class to be moved */
-    NEHOGNonMaximaSuppressionKernel &operator=(NEHOGNonMaximaSuppressionKernel &&) = default;
+    CPPDetectionWindowNonMaximaSuppressionKernel &operator=(CPPDetectionWindowNonMaximaSuppressionKernel &&) = default;
     /** Initialise the kernel's input, output and the euclidean minimum distance
      *
+     * @attention If @ref CLDetectionWindowArray is passed to the kernel, the map() and unmap() methods of @ref CLDetectionWindowArray must be called respectively before and after
+     *             the run() method of @ref CPPDetectionWindowNonMaximaSuppressionKernel
+     *
      * @param[in, out] input_output Input/Output array of @ref DetectionWindow
      * @param[in]      min_distance Radial Euclidean distance for non-maxima suppression
      */
@@ -66,4 +69,4 @@
 };
 }
 
-#endif /* __ARM_COMPUTE_NEHOGNONMAXIMASUPPRESSIONKERNEL_H__ */
+#endif /* __ARM_COMPUTE_CPPDETECTIONWINDOWNONMAXIMASUPPRESSIONKERNEL_H__ */
diff --git a/arm_compute/core/CPP/kernels/CPPSortEuclideanDistanceKernel.h b/arm_compute/core/CPP/kernels/CPPSortEuclideanDistanceKernel.h
index dab0192..b7a7d9f 100644
--- a/arm_compute/core/CPP/kernels/CPPSortEuclideanDistanceKernel.h
+++ b/arm_compute/core/CPP/kernels/CPPSortEuclideanDistanceKernel.h
@@ -24,8 +24,8 @@
 #ifndef __ARM_COMPUTE_CPPSORTEUCLIDEANDISTANCEKERNEL_H__
 #define __ARM_COMPUTE_CPPSORTEUCLIDEANDISTANCEKERNEL_H__
 
+#include "arm_compute/core/CPP/ICPPKernel.h"
 #include "arm_compute/core/IArray.h"
-#include "arm_compute/core/NEON/INEKernel.h"
 
 #include <cstdint>
 #include <mutex>
@@ -33,7 +33,7 @@
 namespace arm_compute
 {
 /** CPP kernel to perform sorting and euclidean distance */
-class CPPSortEuclideanDistanceKernel : public INEKernel
+class CPPSortEuclideanDistanceKernel : public ICPPKernel
 {
 public:
     /** Default constructor */
@@ -63,7 +63,7 @@
     const int32_t    *_num_corner_candidates; /**< Number of corner candidates */
     float             _min_distance;          /**< Radial Euclidean distance */
     InternalKeypoint *_in_out;                /**< Source array of InternalKeypoint */
-    IKeyPointArray   *_output;                /**< Destination array of NEKeyPointArray */
+    IKeyPointArray   *_output;                /**< Destination array of IKeyPointArray */
 };
 
 } // namespace arm_compute
diff --git a/arm_compute/core/Coordinates.h b/arm_compute/core/Coordinates.h
index c936265..3a99abb 100644
--- a/arm_compute/core/Coordinates.h
+++ b/arm_compute/core/Coordinates.h
@@ -37,7 +37,6 @@
 class Coordinates : public Dimensions<int>
 {
 public:
-#ifndef DOXYGEN_SKIP_THIS /* Doxygen gets confused by the templates and can't match the implementation to the declaration */
     /** Constructor to initialize the coordinates.
      *
      * @param[in] coords Values to initialize the dimensions.
@@ -47,7 +46,6 @@
         : Dimensions{ coords... }
     {
     }
-#endif
     /** Allow instances of this class to be copy constructed */
     constexpr Coordinates(const Coordinates &) = default;
     /** Allow instances of this class to be copied */
diff --git a/arm_compute/core/Dimensions.h b/arm_compute/core/Dimensions.h
index d627517..b080435 100644
--- a/arm_compute/core/Dimensions.h
+++ b/arm_compute/core/Dimensions.h
@@ -44,7 +44,6 @@
     /** Number of dimensions the tensor has */
     static constexpr size_t num_max_dimensions = MAX_DIMS;
 
-#ifndef DOXYGEN_SKIP_THIS /* Doxygen gets confused by the templates and can't match the implementation to the declaration */
     /** Constructor to initialize the tensor shape.
      *
      * @param[in] dims Values to initialize the dimensions.
@@ -54,17 +53,19 @@
         : _id{ { dims... } }, _num_dimensions{ sizeof...(dims) }
     {
     }
-#endif
+
     /** Allow instances of this class to be copy constructed */
     Dimensions(const Dimensions &) = default;
+
     /** Allow instances of this class to be copied */
     Dimensions &operator=(const Dimensions &) = default;
+
     /** Allow instances of this class to be move constructed */
     Dimensions(Dimensions &&) = default;
+
     /** Allow instances of this class to be moved */
     Dimensions &operator=(Dimensions &&) = default;
-    /** Pure virtual destructor */
-    virtual ~Dimensions() = 0;
+
     /** Accessor to set the value of one of the dimensions.
      *
      * @param[in] dimension Dimension for which the value is set.
@@ -105,17 +106,36 @@
         return _id[dimension];
     }
     /** Returns the effective dimensionality of the tensor */
-    inline unsigned int num_dimensions() const
+    unsigned int num_dimensions() const
     {
         return _num_dimensions;
     }
 
     /** Set number of dimensions */
-    inline void set_num_dimensions(size_t num_dimensions)
+    void set_num_dimensions(size_t num_dimensions)
     {
         _num_dimensions = num_dimensions;
     }
 
+    /** Collapse dimensions.
+     *
+     * @param[in] n     Number of dimensions to collapse into @p first.
+     * @param[in] first Dimension into which the following @p n dimensions are collapsed.
+     */
+    void collapse(size_t n, size_t first = 0)
+    {
+        ARM_COMPUTE_ERROR_ON(first + n > _id.size());
+
+        // Collapse dimensions into the first
+        _id[first] = std::accumulate(_id.cbegin() + first, _id.cbegin() + first + n, 1, std::multiplies<T>());
+        // Shift the remaining dimensions down
+        std::copy(_id.begin() + first + n, _id.end(), _id.begin() + first + 1);
+        // Reduce the number of dimensions
+        _num_dimensions -= n - 1;
+        // Fill the now empty dimensions with zero
+        std::fill(_id.begin() + _num_dimensions, _id.end(), 0);
+    }
+
     /** Returns a read/write iterator that points to the first element in the dimension array. */
     typename std::array<T, num_max_dimensions>::iterator begin()
     {
@@ -148,13 +168,11 @@
     }
 
 protected:
+    /** Protected destructor. */
+    ~Dimensions() = default;
+
     std::array<T, num_max_dimensions> _id;
     size_t _num_dimensions{ 0 };
 };
-
-template <typename T>
-inline Dimensions<T>::~Dimensions()
-{
-}
 }
 #endif /*__ARM_COMPUTE_DIMENSIONS_H__*/
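
A worked example of the new collapse() behaviour, re-implementing the accumulate/shift/fill steps above on a plain array: collapsing n = 2 dimensions starting at first = 1 of a {2, 3, 4, 5} shape yields {2, 12, 5}. Standalone illustration only, not the member function itself.

    #include <algorithm>
    #include <array>
    #include <cstddef>
    #include <functional>
    #include <iostream>
    #include <numeric>

    int main()
    {
        std::array<std::size_t, 6> id{ { 2, 3, 4, 5, 0, 0 } };
        std::size_t num_dimensions = 4;
        const std::size_t n = 2, first = 1;

        // Collapse dimensions into the first
        id[first] = std::accumulate(id.cbegin() + first, id.cbegin() + first + n, std::size_t{ 1 }, std::multiplies<std::size_t>());
        // Shift the remaining dimensions down
        std::copy(id.begin() + first + n, id.end(), id.begin() + first + 1);
        // Reduce the number of dimensions and zero the tail
        num_dimensions -= n - 1;
        std::fill(id.begin() + num_dimensions, id.end(), 0);

        for(std::size_t i = 0; i < num_dimensions; ++i)
        {
            std::cout << id[i] << (i + 1 < num_dimensions ? " x " : "\n"); // prints: 2 x 12 x 5
        }
    }
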
diff --git a/arm_compute/core/Error.h b/arm_compute/core/Error.h
index a589501..c4c452b 100644
--- a/arm_compute/core/Error.h
+++ b/arm_compute/core/Error.h
@@ -48,6 +48,30 @@
  */
 #define ARM_COMPUTE_UNUSED(var) (void)(var)
 
+#ifdef ARM_COMPUTE_DEBUG_ENABLED
+/** Print the given message
+ *
+ * @param[in] ... Message to display
+ */
+#define ARM_COMPUTE_INFO(...) ::arm_compute::debug(__func__, __FILE__, __LINE__, __VA_ARGS__) // NOLINT
+/** If the condition is true, the given message is printed
+ *
+ * @param[in] cond Condition to evaluate.
+ * @param[in] ...  Message to print if cond is true.
+ */
+#define ARM_COMPUTE_INFO_ON_MSG(cond, ...) \
+    do                                     \
+    {                                      \
+        if(cond)                           \
+        {                                  \
+            ARM_COMPUTE_INFO(__VA_ARGS__); \
+        }                                  \
+    } while(0)
+#else /* ARM_COMPUTE_DEBUG_ENABLED */
+#define ARM_COMPUTE_INFO_ON_MSG(cond, ...)
+#define ARM_COMPUTE_INFO(...)
+#endif /* ARM_COMPUTE_DEBUG_ENABLED */
+
 #ifdef ARM_COMPUTE_ASSERTS_ENABLED
 /** If the condition is true, the given message is printed and an exception is thrown
  *
@@ -121,6 +145,16 @@
  * @param[in] ...      Variable number of arguments of the message.
  */
 [[noreturn]] void error(const char *function, const char *file, const int line, const char *msg, ...);
+
+/** Print a debug message
+ *
+ * @param[in] function Function in which the message was produced.
+ * @param[in] file     Name of the file where the message was produced.
+ * @param[in] line     Line on which the message was produced.
+ * @param[in] msg      Message to display.
+ * @param[in] ...      Variable number of arguments of the message.
+ */
+void debug(const char *function, const char *file, const int line, const char *msg, ...);
 }
 
 #endif /* __ARM_COMPUTE_ERROR_H__ */
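The ARM_COMPUTE_INFO macros introduced above forward the call site (__func__, __FILE__, __LINE__) and a printf-style message to arm_compute::debug(), and compile to nothing unless ARM_COMPUTE_DEBUG_ENABLED is defined. A self-contained sketch of the same pattern, using hypothetical names rather than the library's:

    #include <cstdarg>
    #include <cstdio>

    // Variadic sink: prints the call site followed by a printf-style message.
    void my_debug(const char *function, const char *file, int line, const char *msg, ...)
    {
        std::printf("[DEBUG] %s (%s:%d): ", function, file, line);
        va_list args;
        va_start(args, msg);
        std::vprintf(msg, args);
        va_end(args);
        std::printf("\n");
    }

    #ifdef MY_DEBUG_ENABLED
    #define MY_INFO(...) my_debug(__func__, __FILE__, __LINE__, __VA_ARGS__)
    #define MY_INFO_ON_MSG(cond, ...) \
        do                            \
        {                             \
            if(cond)                  \
            {                         \
                MY_INFO(__VA_ARGS__); \
            }                         \
        } while(0)
    #else
    #define MY_INFO(...)
    #define MY_INFO_ON_MSG(cond, ...)
    #endif

    int main()
    {
        int padding = 3;
        // Prints only when compiled with -DMY_DEBUG_ENABLED and the condition holds.
        MY_INFO_ON_MSG(padding != 0, "Tensor has %d elements of padding", padding);
    }
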
diff --git a/arm_compute/core/FixedPoint.h b/arm_compute/core/FixedPoint.h
new file mode 100644
index 0000000..925b494
--- /dev/null
+++ b/arm_compute/core/FixedPoint.h
@@ -0,0 +1,217 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_FIXEDPOINT_H__
+#define __ARM_COMPUTE_FIXEDPOINT_H__
+
+#include <cstdint>
+
+namespace arm_compute
+{
+using qint8_t  = int8_t;  /**< 8 bit fixed point scalar value */
+using qint16_t = int16_t; /**< 16 bit fixed point scalar value */
+using qint32_t = int32_t; /**< 32 bit fixed point scalar value */
+
+/** 8 bit fixed point scalar saturating shift left
+ *
+ * @param[in] a     First 8 bit fixed point input
+ * @param[in] shift Shift amount
+ *
+ * @return The result of the 8 bit fixed point shift. The result is saturated in case of overflow
+ */
+qint8_t sqshl_qs8(qint8_t a, int shift);
+
+/** 8 bit fixed point scalar absolute value
+ *
+ * @param[in] a 8 bit fixed point input
+ *
+ * @return The result of the 8 bit fixed point absolute value
+ */
+qint8_t sabs_qs8(qint8_t a);
+
+/** 8 bit fixed point scalar add
+ *
+ * @param[in] a First 8 bit fixed point input
+ * @param[in] b Second 8 bit fixed point input
+ *
+ * @return The result of the 8 bit fixed point addition
+ */
+qint8_t sadd_qs8(qint8_t a, qint8_t b);
+
+/** 8 bit fixed point scalar saturating add
+ *
+ * @param[in] a First 8 bit fixed point input
+ * @param[in] b Second 8 bit fixed point input
+ *
+ * @return The result of the 8 bit fixed point addition. The result is saturated in case of overflow
+ */
+qint8_t sqadd_qs8(qint8_t a, qint8_t b);
+
+/** 16 bit fixed point scalar saturating add
+ *
+ * @param[in] a First 16 bit fixed point input
+ * @param[in] b Second 16 bit fixed point input
+ *
+ * @return The result of the 16 bit fixed point addition. The result is saturated in case of overflow
+ */
+qint16_t sqadd_qs16(qint16_t a, qint16_t b);
+
+/** 8 bit fixed point scalar subtraction
+ *
+ * @param[in] a First 8 bit fixed point input
+ * @param[in] b Second 8 bit fixed point input
+ *
+ * @return The result of the 8 bit fixed point subtraction
+ */
+qint8_t ssub_qs8(qint8_t a, qint8_t b);
+
+/** 8 bit fixed point scalar saturating subtraction
+ *
+ * @param[in] a First 8 bit fixed point input
+ * @param[in] b Second 8 bit fixed point input
+ *
+ * @return The result of the 8 bit fixed point subtraction. The result is saturated in case of overflow
+ */
+qint8_t sqsub_qs8(qint8_t a, qint8_t b);
+
+/** 8 bit fixed point scalar multiply
+ *
+ * @param[in] a                    First 8 bit fixed point input
+ * @param[in] b                    Second 8 bit fixed point input
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8 bit fixed point multiplication.
+ */
+qint8_t smul_qs8(qint8_t a, qint8_t b, int fixed_point_position);
+
+/** 8 bit fixed point scalar saturating multiply
+ *
+ * @param[in] a                    First 8 bit fixed point input
+ * @param[in] b                    Second 8 bit fixed point input
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8 bit fixed point multiplication. The result is saturated in case of overflow
+ */
+qint8_t sqmul_qs8(qint8_t a, qint8_t b, int fixed_point_position);
+
+/** 8 bit fixed point scalar multiply long
+ *
+ * @param[in] a                    First 8 bit fixed point input
+ * @param[in] b                    Second 8 bit fixed point input
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8 bit fixed point multiplication long. The result is saturated in case of overflow
+ */
+qint16_t sqmull_qs8(qint8_t a, qint8_t b, int fixed_point_position);
+
+/** 16 bit fixed point scalar saturating multiply
+ *
+ * @param[in] a                    First 16 bit fixed point input
+ * @param[in] b                    Second 16 bit fixed point input
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 16 bit fixed point multiplication. The result is saturated in case of overflow
+ */
+qint16_t sqmul_qs16(qint16_t a, qint16_t b, int fixed_point_position);
+
+/** 8 bit fixed point scalar inverse square root
+ *
+ * @param[in] a                    8 bit fixed point input
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8 bit fixed point inverse square root.
+ */
+qint8_t sinvsqrt_qs8(qint8_t a, int fixed_point_position);
+
+/** 8 bit fixed point scalar division
+ *
+ * @param[in] a                    First 8 bit fixed point input
+ * @param[in] b                    Second 8 bit fixed point input
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8 bit fixed point division.
+ */
+qint8_t sdiv_qs8(qint8_t a, qint8_t b, int fixed_point_position);
+
+/** 8 bit fixed point scalar saturating exponential
+ *
+ * @param[in] a                    8 bit fixed point input
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8 bit fixed point exponential. The result is saturated in case of overflow
+ */
+qint8_t sqexp_qs8(qint8_t a, int fixed_point_position);
+
+/** 8 bit fixed point scalar logarithm
+ *
+ * @param[in] a                    8 bit fixed point input
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8 bit fixed point logarithm.
+ */
+qint8_t slog_qs8(qint8_t a, int fixed_point_position);
+
+/** Convert an 8 bit fixed point to float
+ *
+ * @param[in] a                    Input to convert
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the conversion 8 bit fixed point -> float
+ */
+float scvt_f32_qs8(qint8_t a, int fixed_point_position);
+
+/** Convert a float to 8 bit fixed point
+ *
+ * @param[in] a                    Input to convert
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the conversion float -> 8 bit fixed point
+ */
+qint8_t scvt_qs8_f32(float a, int fixed_point_position);
+
+/** Convert a 16 bit fixed point to float
+ *
+ * @param[in] a                    Input to convert
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the conversion 16 bit fixed point -> float
+ */
+float scvt_f32_qs16(qint16_t a, int fixed_point_position);
+
+/** Convert a float to 16 bit fixed point
+ *
+ * @param[in] a                    Input to convert
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the conversion float -> 16 bit fixed point
+ */
+qint16_t scvt_qs16_f32(float a, int fixed_point_position);
+
+/** Scalar saturating move and narrow.
+ *
+ * @param[in] a Input to convert to 8 bit fixed point
+ *
+ * @return The narrowing conversion to 8 bit
+ */
+qint8_t sqmovn_qs16(qint16_t a);
+}
+#include "arm_compute/core/FixedPoint.inl"
+#endif /* __ARM_COMPUTE_FIXEDPOINT_H__ */
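Throughout this header, fixed_point_position is the number of fractional bits: a QS8 value x stands for x / 2^fixed_point_position. A standalone round trip mirroring the arithmetic of scvt_qs8_f32 and scvt_f32_qs8 shown below in FixedPoint.inl (illustration only, not a call into the library):

    #include <cstdint>
    #include <iostream>

    int main()
    {
        const int fixed_point_position = 4; // Q3.4: 3 integer bits, 4 fractional bits

        // float -> QS8: scale by 2^4 and round to nearest (same arithmetic as scvt_qs8_f32)
        float  value = 1.75f;
        int8_t q     = static_cast<int8_t>(value * (1 << fixed_point_position) + 0.5f); // 1.75 * 16 = 28

        // QS8 -> float: divide by 2^4 (same arithmetic as scvt_f32_qs8)
        float back = static_cast<float>(q) / (1 << fixed_point_position);

        std::cout << "q = " << static_cast<int>(q) << ", back = " << back << std::endl; // q = 28, back = 1.75
    }
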
diff --git a/arm_compute/core/FixedPoint.inl b/arm_compute/core/FixedPoint.inl
new file mode 100644
index 0000000..4263a6f
--- /dev/null
+++ b/arm_compute/core/FixedPoint.inl
@@ -0,0 +1,252 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <cmath>
+#include <limits>
+
+namespace
+{
+template <typename TpIn, typename TpSat>
+inline TpSat saturate_convert(TpIn a)
+{
+    if(a > std::numeric_limits<TpSat>::max())
+    {
+        a = std::numeric_limits<TpSat>::max();
+    }
+    if(a < std::numeric_limits<TpSat>::min())
+    {
+        a = std::numeric_limits<TpSat>::min();
+    }
+    return static_cast<TpSat>(a);
+}
+} // namespace
+
+namespace arm_compute
+{
+inline qint8_t sqshl_qs8(qint8_t a, int shift)
+{
+    qint16_t tmp = static_cast<qint16_t>(a) << shift;
+    // Saturate the result in case of overflow and cast to qint8_t
+    return saturate_convert<qint16_t, qint8_t>(tmp);
+}
+
+inline qint8_t sabs_qs8(qint8_t a)
+{
+    return (a < 0) ? -a : a; // absolute value of a two's complement value
+}
+
+inline qint8_t sadd_qs8(qint8_t a, qint8_t b)
+{
+    return a + b;
+}
+
+inline qint8_t sqadd_qs8(qint8_t a, qint8_t b)
+{
+    // We need to store the temporary result in qint16_t otherwise we cannot evaluate the overflow
+    qint16_t tmp = (static_cast<qint16_t>(a) + static_cast<qint16_t>(b));
+
+    // Saturate the result in case of overflow and cast to qint8_t
+    return saturate_convert<qint16_t, qint8_t>(tmp);
+}
+
+inline qint16_t sqadd_qs16(qint16_t a, qint16_t b)
+{
+    // We need to store the temporary result in qint32_t otherwise we cannot evaluate the overflow
+    qint32_t tmp = (static_cast<qint32_t>(a) + static_cast<qint32_t>(b));
+
+    // Saturate the result in case of overflow and cast to qint16_t
+    return saturate_convert<qint32_t, qint16_t>(tmp);
+}
+
+inline qint8_t ssub_qs8(qint8_t a, qint8_t b)
+{
+    return a - b;
+}
+
+inline qint8_t sqsub_qs8(qint8_t a, qint8_t b)
+{
+    // We need to store the temporary result in qint16_t otherwise we cannot evaluate the overflow
+    qint16_t tmp = static_cast<qint16_t>(a) - static_cast<qint16_t>(b);
+
+    // Saturate the result in case of overflow and cast to qint8_t
+    return saturate_convert<qint16_t, qint8_t>(tmp);
+}
+
+inline qint8_t smul_qs8(qint8_t a, qint8_t b, int fixed_point_position)
+{
+    const qint16_t round_up_const = (1 << (fixed_point_position - 1));
+
+    qint16_t tmp = static_cast<qint16_t>(a) * static_cast<qint16_t>(b);
+
+    // Rounding up
+    tmp += round_up_const;
+
+    return static_cast<qint8_t>(tmp >> fixed_point_position);
+}
+
+inline qint8_t sqmul_qs8(qint8_t a, qint8_t b, int fixed_point_position)
+{
+    const qint16_t round_up_const = (1 << (fixed_point_position - 1));
+
+    qint16_t tmp = static_cast<qint16_t>(a) * static_cast<qint16_t>(b);
+
+    // Rounding up
+    tmp += round_up_const;
+
+    return saturate_convert<qint16_t, qint8_t>(tmp >> fixed_point_position);
+}
+
+inline qint16_t sqmul_qs16(qint16_t a, qint16_t b, int fixed_point_position)
+{
+    const qint32_t round_up_const = (1 << (fixed_point_position - 1));
+
+    qint32_t tmp = static_cast<qint32_t>(a) * static_cast<qint32_t>(b);
+
+    // Rounding up
+    tmp += round_up_const;
+
+    return saturate_convert<qint32_t, qint16_t>(tmp >> fixed_point_position);
+}
+
+inline qint16_t sqmull_qs8(qint8_t a, qint8_t b, int fixed_point_position)
+{
+    const qint16_t round_up_const = (1 << (fixed_point_position - 1));
+
+    qint16_t tmp = static_cast<qint16_t>(a) * static_cast<qint16_t>(b);
+
+    // Rounding up
+    tmp += round_up_const;
+
+    return tmp >> fixed_point_position;
+}
+
+inline qint8_t sinvsqrt_qs8(qint8_t a, int fixed_point_position)
+{
+    qint8_t shift = 8 - (fixed_point_position + (__builtin_clz(a) - 24));
+
+    qint8_t const_three = (3 << fixed_point_position);
+    qint8_t temp        = shift < 0 ? (a << -shift) : (a >> shift);
+    qint8_t x2          = temp;
+
+    // We need three iterations to find the result
+    for(int i = 0; i < 3; i++)
+    {
+        qint8_t three_minus_dx = ssub_qs8(const_three, smul_qs8(temp, smul_qs8(x2, x2, fixed_point_position), fixed_point_position));
+        x2                     = (smul_qs8(x2, three_minus_dx, fixed_point_position) >> 1);
+    }
+
+    temp = shift < 0 ? (x2 << (-shift >> 1)) : (x2 >> (shift >> 1));
+
+    return temp;
+}
+
+inline qint8_t sdiv_qs8(qint8_t a, qint8_t b, int fixed_point_position)
+{
+    qint16_t temp = a << fixed_point_position;
+    return (qint8_t)(temp / b);
+}
+
+inline qint8_t sqexp_qs8(qint8_t a, int fixed_point_position)
+{
+    // Constants
+    qint8_t const_one = (1 << fixed_point_position);
+    qint8_t ln2       = ((0x58 >> (6 - fixed_point_position)) + 1) >> 1;
+    qint8_t inv_ln2   = (((0x38 >> (6 - fixed_point_position)) + 1) >> 1) | const_one;
+    qint8_t A         = ((0x7F >> (6 - fixed_point_position)) + 1) >> 1;
+    qint8_t B         = ((0x3F >> (6 - fixed_point_position)) + 1) >> 1;
+    qint8_t C         = ((0x16 >> (6 - fixed_point_position)) + 1) >> 1;
+    qint8_t D         = ((0x05 >> (6 - fixed_point_position)) + 1) >> 1;
+
+    // Polynomial expansion
+    int     dec_a = (sqmul_qs8(a, inv_ln2, fixed_point_position) >> fixed_point_position);
+    qint8_t alpha = sabs_qs8(sqsub_qs8(a, sqmul_qs8(ln2, sqshl_qs8(dec_a, fixed_point_position), fixed_point_position)));
+    qint8_t sum   = sqadd_qs8(sqmul_qs8(alpha, D, fixed_point_position), C);
+    sum           = sqadd_qs8(sqmul_qs8(alpha, sum, fixed_point_position), B);
+    sum           = sqadd_qs8(sqmul_qs8(alpha, sum, fixed_point_position), A);
+    sum           = sqmul_qs8(alpha, sum, fixed_point_position);
+    sum           = sqadd_qs8(sum, const_one);
+
+    return (dec_a < 0) ? (sum >> -dec_a) : sqshl_qs8(sum, dec_a);
+}
+
+inline qint8_t slog_qs8(qint8_t a, int fixed_point_position)
+{
+    // Constants
+    qint8_t const_one = (1 << fixed_point_position);
+    qint8_t ln2       = (0x58 >> (7 - fixed_point_position));
+    qint8_t A         = (0x5C >> (7 - fixed_point_position - 1));
+    qint8_t B         = -(0x56 >> (7 - fixed_point_position));
+    qint8_t C         = (0x29 >> (7 - fixed_point_position));
+    qint8_t D         = -(0x0A >> (7 - fixed_point_position));
+
+    if((const_one == a) || (a < 0))
+    {
+        return 0;
+    }
+    else if(a < const_one)
+    {
+        return -slog_qs8(sdiv_qs8(const_one, a, fixed_point_position), fixed_point_position);
+    }
+
+    // Remove even powers of 2
+    qint8_t shift_val = 31 - __builtin_clz(a >> fixed_point_position);
+    a >>= shift_val;
+    a = ssub_qs8(a, const_one);
+
+    // Polynomial expansion
+    auto sum = sqadd_qs8(sqmul_qs8(a, D, fixed_point_position), C);
+    sum      = sqadd_qs8(sqmul_qs8(a, sum, fixed_point_position), B);
+    sum      = sqadd_qs8(sqmul_qs8(a, sum, fixed_point_position), A);
+    sum      = sqmul_qs8(a, sum, fixed_point_position);
+
+    return smul_qs8(sadd_qs8(sum, shift_val << fixed_point_position), ln2, fixed_point_position);
+}
+
+inline float scvt_f32_qs8(qint8_t a, int fixed_point_position)
+{
+    return static_cast<float>(a) / (1 << fixed_point_position);
+}
+
+inline qint8_t scvt_qs8_f32(float a, int fixed_point_position)
+{
+    // round_nearest_integer(a * 2^(fixed_point_position))
+    return static_cast<qint8_t>(static_cast<float>(a) * (1 << fixed_point_position) + 0.5f);
+}
+
+inline float scvt_f32_qs16(qint16_t a, int fixed_point_position)
+{
+    return static_cast<float>(a) / (1 << fixed_point_position);
+}
+
+inline qint16_t scvt_qs16_f32(float a, int fixed_point_position)
+{
+    // round_nearest_integer(a * 2^(fixed_point_position))
+    return static_cast<qint16_t>(static_cast<float>(a) * (1 << fixed_point_position) + 0.5f);
+}
+
+inline qint8_t sqmovn_qs16(qint16_t a)
+{
+    // Saturate the result in case of overflow and cast to qint8_t
+    return saturate_convert<qint16_t, qint8_t>(a);
+}
+}
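The scalar saturating helpers above widen to a larger integer type, operate, then clamp the result back into the narrow range; the multiplies also add 2^(fixed_point_position - 1) before shifting so the product is rounded to nearest. A standalone numeric check of the sqadd_qs8 and sqmul_qs8 arithmetic in Q3.4 (replicating the code above, not calling it):

    #include <cstdint>
    #include <iostream>

    // Clamp a 16 bit intermediate into the int8_t range (same idea as saturate_convert above).
    static int8_t saturate_to_qs8(int16_t v)
    {
        if(v > INT8_MAX)
        {
            return INT8_MAX;
        }
        if(v < INT8_MIN)
        {
            return INT8_MIN;
        }
        return static_cast<int8_t>(v);
    }

    int main()
    {
        const int fpp = 4; // Q3.4

        // Saturating add: 6.0 + 3.0 = 9.0 does not fit in Q3.4 (max ~7.94), so the result clamps to 127.
        int16_t sum = static_cast<int16_t>(6 << fpp) + static_cast<int16_t>(3 << fpp);      // 96 + 48 = 144
        std::cout << "sqadd_qs8: " << static_cast<int>(saturate_to_qs8(sum)) << std::endl;  // 127

        // Saturating multiply with rounding: 1.5 * 2.25 = 3.375, i.e. 54 in Q3.4.
        int16_t a    = 24; // 1.5  * 16
        int16_t b    = 36; // 2.25 * 16
        int16_t prod = static_cast<int16_t>(a * b + (1 << (fpp - 1)));                             // 864 + 8
        std::cout << "sqmul_qs8: " << static_cast<int>(saturate_to_qs8(prod >> fpp)) << std::endl; // 54
    }
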
diff --git a/arm_compute/core/Helpers.h b/arm_compute/core/Helpers.h
index a84ce2c..07318ea 100644
--- a/arm_compute/core/Helpers.h
+++ b/arm_compute/core/Helpers.h
@@ -24,13 +24,14 @@
 #ifndef __ARM_COMPUTE_HELPERS_H__
 #define __ARM_COMPUTE_HELPERS_H__
 
+#include "arm_compute/core/CL/CLTypes.h"
 #include "arm_compute/core/Coordinates.h"
 #include "arm_compute/core/IAccessWindow.h"
 #include "arm_compute/core/Steps.h"
 #include "arm_compute/core/Strides.h"
 #include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Types.h"
 #include "arm_compute/core/Window.h"
-
 #include <array>
 #include <cstddef>
 #include <cstdint>
@@ -43,11 +44,10 @@
 {
 class IKernel;
 class ITensor;
-class TensorInfo;
+class ITensorInfo;
 
 namespace cpp14
 {
-#ifndef DOXYGEN_SKIP_THIS /* Doxygen gets confused by the templates and can't match the implementation to the declaration */
 template <class T>
 struct _Unique_if
 {
@@ -84,12 +84,43 @@
 template <class T, class... Args>
 typename _Unique_if<T>::_Known_bound
 make_unique(Args &&...) = delete;
-#endif /* DOXYGEN_SKIP_THIS */
-}
 }
 
-namespace
+template <typename T>
+struct enable_bitwise_ops
 {
+    static constexpr bool value = false;
+};
+
+template <typename T>
+typename std::enable_if<enable_bitwise_ops<T>::value, T>::type operator&(T lhs, T rhs)
+{
+    using underlying_type = typename std::underlying_type<T>::type;
+    return static_cast<T>(static_cast<underlying_type>(lhs) & static_cast<underlying_type>(rhs));
+}
+
+namespace traits
+{
+/** Check if a type T is contained in a tuple Tuple of types */
+template <typename T, typename Tuple>
+struct is_contained;
+
+template <typename T>
+struct is_contained<T, std::tuple<>> : std::false_type
+{
+};
+
+template <typename T, typename... Ts>
+struct is_contained<T, std::tuple<T, Ts...>> : std::true_type
+{
+};
+
+template <typename T, typename U, typename... Ts>
+struct is_contained<T, std::tuple<U, Ts...>> : is_contained<T, std::tuple<Ts...>>
+{
+};
+}
+
 /** Computes bilinear interpolation using the pointer to the top-left pixel and the pixel's distance between
  * the real coordinates and the smallest following integer coordinates.
  *
@@ -215,10 +246,7 @@
 {
     return foldl(std::forward<F>(func), func(std::forward<I>(initial), std::forward<T>(value)), std::forward<Vs>(values)...);
 }
-}
 
-namespace arm_compute
-{
 /** Iterator updated by @ref execute_window_loop for each window element */
 class Iterator
 {
@@ -334,7 +362,7 @@
  *
  * @return The maximum window the kernel can be executed on.
  */
-Window calculate_max_window(const TensorInfo &info, const Steps &steps = Steps(), bool skip_border = false, BorderSize border_size = BorderSize());
+Window calculate_max_window(const ITensorInfo &info, const Steps &steps = Steps(), bool skip_border = false, BorderSize border_size = BorderSize());
 
 /** Calculate the maximum window used by a horizontal kernel for a given tensor shape and border setting
  *
@@ -345,7 +373,17 @@
  *
  * @return The maximum window the kernel can be executed on.
  */
-Window calculate_max_window_horizontal(const TensorInfo &info, const Steps &steps = Steps(), bool skip_border = false, BorderSize border_size = BorderSize());
+Window calculate_max_window_horizontal(const ITensorInfo &info, const Steps &steps = Steps(), bool skip_border = false, BorderSize border_size = BorderSize());
+
+/** Calculate the maximum window for a given tensor shape and border setting. The window will also include the border.
+ *
+ * @param[in] info        Tensor info object defining the shape of the object for which the window is created.
+ * @param[in] steps       (Optional) Number of elements processed for each step.
+ * @param[in] border_size (Optional) Border size. The border region will be included in the window.
+ *
+ * @return The maximum window the kernel can be executed on.
+ */
+Window calculate_max_enlarged_window(const ITensorInfo &info, const Steps &steps = Steps(), BorderSize border_size = BorderSize());
 
 /** Intersect multiple valid regions.
  *
@@ -386,7 +424,7 @@
  *         calculated based on the tensor shape and the strides of lower dimensions.
  */
 template <typename T, typename... Ts>
-inline Strides compute_strides(const TensorInfo &info, T stride_x, Ts &&... fixed_strides)
+inline Strides compute_strides(const ITensorInfo &info, T stride_x, Ts &&... fixed_strides)
 {
     const TensorShape &shape = info.tensor_shape();
 
@@ -408,11 +446,62 @@
  * @return Strides object based on element size and tensor shape.
  */
 template <typename... Ts>
-inline Strides compute_strides(const TensorInfo &info)
+inline Strides compute_strides(const ITensorInfo &info)
 {
     return compute_strides(info, info.element_size());
 }
-}
+
+/** Auto initialize the tensor info (shape, number of channels, data type and fixed point position) if the current assignment is empty.
+ *
+ * @param[in,out] info                 Tensor info used to check and assign.
+ * @param[in]     shape                New shape.
+ * @param[in]     num_channels         New number of channels.
+ * @param[in]     data_type            New data type
+ * @param[in]     fixed_point_position New fixed point position
+ *
+ * @return True if the tensor info has been initialized
+ */
+bool auto_init_if_empty(ITensorInfo &info, const TensorShape &shape, int num_channels, DataType data_type, int fixed_point_position);
+
+/** Set the shape to the specified value if the current assignment is empty.
+ *
+ * @param[in,out] info  Tensor info used to check and assign.
+ * @param[in]     shape New shape.
+ *
+ * @return True if the shape has been changed.
+ */
+bool set_shape_if_empty(ITensorInfo &info, const TensorShape &shape);
+
+/** Set the format, data type and number of channels to the specified value if
+ * the current data type is unknown.
+ *
+ * @param[in,out] info   Tensor info used to check and assign.
+ * @param[in]     format New format.
+ *
+ * @return True if the format has been changed.
+ */
+bool set_format_if_unknown(ITensorInfo &info, Format format);
+
+/** Set the data type and number of channels to the specified value if
+ * the current data type is unknown.
+ *
+ * @param[in,out] info      Tensor info used to check and assign.
+ * @param[in]     data_type New data type.
+ *
+ * @return True if the data type has been changed.
+ */
+bool set_data_type_if_unknown(ITensorInfo &info, DataType data_type);
+
+/** Set the fixed point position to the specified value if
+ * the current fixed point position is 0 and the data type is QS8 or QS16
+ *
+ * @param[in,out] info                 Tensor info used to check and assign.
+ * @param[in]     fixed_point_position New fixed point position
+ *
+ * @return True if the fixed point position has been changed.
+ */
+bool set_fixed_point_position_if_zero(ITensorInfo &info, int fixed_point_position);
+} // namespace arm_compute
 
 #include "arm_compute/core/Helpers.inl"
 #endif /*__ARM_COMPUTE_HELPERS_H__ */
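The enable_bitwise_ops trait added to Helpers.h opts individual enum classes into operator& through SFINAE: the operator only takes part in overload resolution when a specialization sets value to true (the library does this for GPUTarget in CLHelpers.h). A standalone sketch of the pattern with a hypothetical enum used only for this illustration:

    #include <iostream>
    #include <type_traits>

    template <typename T>
    struct enable_bitwise_ops
    {
        static constexpr bool value = false;
    };

    // operator& participates in overload resolution only for opted-in enums.
    template <typename T>
    typename std::enable_if<enable_bitwise_ops<T>::value, T>::type operator&(T lhs, T rhs)
    {
        using underlying_type = typename std::underlying_type<T>::type;
        return static_cast<T>(static_cast<underlying_type>(lhs) & static_cast<underlying_type>(rhs));
    }

    // Hypothetical flag-like enum, not part of the library.
    enum class Feature : unsigned int
    {
        NONE = 0x0,
        FP16 = 0x1,
        DOT  = 0x2
    };

    template <>
    struct enable_bitwise_ops<Feature>
    {
        static constexpr bool value = true;
    };

    int main()
    {
        Feature mask     = Feature::FP16;
        bool    has_fp16 = ((mask & Feature::FP16) != Feature::NONE);
        std::cout << std::boolalpha << has_fp16 << std::endl; // true
    }
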
diff --git a/arm_compute/core/Helpers.inl b/arm_compute/core/Helpers.inl
index 4aa7acf..f885810 100644
--- a/arm_compute/core/Helpers.inl
+++ b/arm_compute/core/Helpers.inl
@@ -27,7 +27,7 @@
 #include <cmath>
 #include <numeric>
 
-namespace
+namespace arm_compute
 {
 inline uint8_t delta_bilinear_c1u8(const uint8_t *pixel_ptr, size_t stride, float dx, float dy)
 {
@@ -122,11 +122,7 @@
     // Return average
     return sum / (x_elements * y_elements);
 }
-}
 
-#ifndef DOXYGEN_SKIP_THIS /* Doxygen gets confused by the templates and can't match the implementation to the declaration */
-namespace arm_compute
-{
 template <size_t dimension>
 struct IncrementIterators
 {
@@ -143,6 +139,11 @@
         it.increment(dimension);
         // End of recursion
     }
+
+    static void unroll()
+    {
+        // End of recursion
+    }
 };
 
 template <size_t dim>
@@ -189,7 +190,7 @@
     : Iterator()
 {
     ARM_COMPUTE_ERROR_ON(tensor == nullptr);
-    const TensorInfo *info = tensor->info();
+    const ITensorInfo *info = tensor->info();
     ARM_COMPUTE_ERROR_ON(info == nullptr);
     const Strides &strides = info->strides_in_bytes();
 
@@ -244,5 +245,62 @@
         _dims[n]._dim_start = _dims[dimension]._dim_start;
     }
 }
+
+inline bool auto_init_if_empty(ITensorInfo &info, const TensorShape &shape, int num_channels, DataType data_type, int fixed_point_position)
+{
+    if(info.tensor_shape().total_size() == 0)
+    {
+        info.set_data_type(data_type);
+        info.set_tensor_shape(shape);
+        info.set_num_channels(num_channels);
+        info.set_fixed_point_position(fixed_point_position);
+        return true;
+    }
+
+    return false;
 }
-#endif /* DOXYGEN_SKIP_THIS */
+
+inline bool set_shape_if_empty(ITensorInfo &info, const TensorShape &shape)
+{
+    if(info.tensor_shape().total_size() == 0)
+    {
+        info.set_tensor_shape(shape);
+        return true;
+    }
+
+    return false;
+}
+
+inline bool set_format_if_unknown(ITensorInfo &info, Format format)
+{
+    if(info.data_type() == DataType::UNKNOWN)
+    {
+        info.set_format(format);
+        return true;
+    }
+
+    return false;
+}
+
+inline bool set_data_type_if_unknown(ITensorInfo &info, DataType data_type)
+{
+    if(info.data_type() == DataType::UNKNOWN)
+    {
+        info.set_data_type(data_type);
+        return true;
+    }
+
+    return false;
+}
+
+inline bool set_fixed_point_position_if_zero(ITensorInfo &info, int fixed_point_position)
+{
+    if(info.fixed_point_position() == 0 && (info.data_type() == DataType::QS8 || info.data_type() == DataType::QS16))
+    {
+        info.set_fixed_point_position(fixed_point_position);
+        return true;
+    }
+
+    return false;
+}
+} // namespace arm_compute
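The auto_init_if_empty() family added above lets a kernel's configure() step fill in an output tensor's metadata only when the caller left it empty (tensor_shape().total_size() == 0, or data_type() == UNKNOWN for the format/data-type setters). A hedged usage sketch, assuming the library's concrete TensorInfo class implements ITensorInfo as elsewhere in this release:

    #include "arm_compute/core/Helpers.h"
    #include "arm_compute/core/TensorInfo.h"

    using namespace arm_compute;

    void configure_like(const ITensorInfo &input, ITensorInfo &output)
    {
        // If the caller passed a default-constructed (empty) output info,
        // initialise it to match the input; otherwise leave it untouched.
        auto_init_if_empty(output, input.tensor_shape(), input.num_channels(), input.data_type(), input.fixed_point_position());
    }

    int main()
    {
        TensorInfo input(TensorShape(16U, 16U), 1, DataType::F32);
        TensorInfo output; // empty: total_size() == 0
        configure_like(input, output);
        // output now describes a 16x16 single-channel F32 tensor.
    }
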
diff --git a/arm_compute/core/IAccessWindow.h b/arm_compute/core/IAccessWindow.h
index 3b905ed..cf7490d 100644
--- a/arm_compute/core/IAccessWindow.h
+++ b/arm_compute/core/IAccessWindow.h
@@ -33,7 +33,7 @@
 namespace arm_compute
 {
 class Window;
-class TensorInfo;
+class ITensorInfo;
 
 /** Decrease @p required in steps of @p step until it's less than @p available.
  *
@@ -112,7 +112,7 @@
      * @param[in]     width  Number of elements that are accessed in X direction.
      * @param[in]     height Number of elements that are accessed in Y direction.
      */
-    AccessWindowRectangle(TensorInfo *info, int x, int y, int width, int height)
+    AccessWindowRectangle(ITensorInfo *info, int x, int y, int width, int height)
         : AccessWindowRectangle(info, x, y, width, height, 1.f, 1.f)
     {
     }
@@ -129,7 +129,7 @@
      * @param[in]     scale_x Ratio along the X direction between the window used by the execute_window_loop and the rectangular access pattern defined
      * @param[in]     scale_y Ratio along the Y direction between the window used by the execute_window_loop and the rectangular access pattern defined
      */
-    AccessWindowRectangle(TensorInfo *info, int x, int y, int width, int height, float scale_x, float scale_y)
+    AccessWindowRectangle(ITensorInfo *info, int x, int y, int width, int height, float scale_x, float scale_y)
         : _info(info), _x(x), _y(y), _width(width), _height(height), _scale_x(scale_x), _scale_y(scale_y)
     {
         ARM_COMPUTE_ERROR_ON(width < 0);
@@ -171,13 +171,13 @@
     bool update_padding_if_needed(const Window &window) const override;
 
 protected:
-    TensorInfo *_info;
-    int         _x;
-    int         _y;
-    int         _width;
-    int         _height;
-    float       _scale_x;
-    float       _scale_y;
+    ITensorInfo *_info;
+    int          _x;
+    int          _y;
+    int          _width;
+    int          _height;
+    float        _scale_x;
+    float        _scale_y;
 };
 
 /** Implementation of a column access pattern. */
@@ -193,7 +193,7 @@
      * @param[in]     height  Number of elements that are accessed in Y direction.
      * @param[in]     scale_y Ratio along the Y direction between the window used by the execute_window_loop and the rectangular access pattern defined
      */
-    AccessWindowVertical(TensorInfo *info, int y, int height, float scale_y = 1.f)
+    AccessWindowVertical(ITensorInfo *info, int y, int height, float scale_y = 1.f)
         : AccessWindowRectangle(info, 0, y, 1, height, 1.f, scale_y)
     {
         ARM_COMPUTE_ERROR_ON(height < 0);
@@ -214,7 +214,7 @@
      * @param[in]     width   Number of elements that are accessed in X direction.
      * @param[in]     scale_x Ratio along the X direction between the window used by the execute_window_loop and the rectangular access pattern defined
      */
-    AccessWindowHorizontal(TensorInfo *info, int x, int width, float scale_x = 1.f)
+    AccessWindowHorizontal(ITensorInfo *info, int x, int width, float scale_x = 1.f)
         : AccessWindowRectangle(info, x, 0, width, 1, scale_x, 1.f)
     {
         ARM_COMPUTE_ERROR_ON(width < 0);
diff --git a/arm_compute/core/ITensor.h b/arm_compute/core/ITensor.h
index 55464a7..202b50a 100644
--- a/arm_compute/core/ITensor.h
+++ b/arm_compute/core/ITensor.h
@@ -40,12 +40,12 @@
      *
      * @return A pointer to the tensor's metadata.
      */
-    virtual TensorInfo *info() const = 0;
+    virtual ITensorInfo *info() const = 0;
     /** Interface to be implemented by the child class to return the tensor's metadata
      *
      * @return A pointer to the tensor's metadata.
      */
-    virtual TensorInfo *info() = 0;
+    virtual ITensorInfo *info() = 0;
     /** Default virtual destructor */
     virtual ~ITensor() = default;
     /** Interface to be implemented by the child class to return a pointer to CPU memory
diff --git a/arm_compute/core/ITensorInfo.h b/arm_compute/core/ITensorInfo.h
new file mode 100644
index 0000000..bb3ac6e
--- /dev/null
+++ b/arm_compute/core/ITensorInfo.h
@@ -0,0 +1,195 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_ITENSORINFO_H__
+#define __ARM_COMPUTE_ITENSORINFO_H__
+
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Strides.h"
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+
+#include <cstddef>
+
+namespace arm_compute
+{
+/** Store the tensor's metadata */
+class ITensorInfo
+{
+public:
+    /** Default virtual destructor */
+    virtual ~ITensorInfo() = default;
+    /** Set the data type to the specified value.
+     *
+     * @warning This resets the format to UNKNOWN.
+     *
+     * @param[in] data_type The new data type.
+     */
+    virtual void set_data_type(DataType data_type) = 0;
+    /** Set the number of channels to the specified value.
+     *
+     * @warning This resets the format to UNKNOWN.
+     *
+     * @param[in] num_channels New number of channels.
+     */
+    virtual void set_num_channels(int num_channels) = 0;
+    /** Set the format of an already initialized tensor.
+     *
+     * @note If the data type has already been configured (i.e. not UNKNOWN) it
+     * must match the new format. If data type hasn't been configured it will
+     * be based on the format.
+     *
+     * @param[in] format Single-plane format of the tensor.
+     */
+    virtual void set_format(Format format) = 0;
+    /** Set the shape of an already initialized tensor.
+     *
+     * @warning Changing the shape requires to recompute the strides and is
+     * therefore only possible if the tensor hasn't been allocated yet.
+     *
+     * @param[in] shape New tensor shape.
+     */
+    virtual void set_tensor_shape(TensorShape shape) = 0;
+    /** Set the fixed point position to the specified value
+     *
+     * @warning The fixed point position must be set once the data type has been configured
+     *
+     * @param[in] fixed_point_position The new fixed point position
+     */
+    virtual void set_fixed_point_position(int fixed_point_position) = 0;
+    /** Update the offset to the first element and the strides to automatically computed values.
+     *
+     * @note The padding used by this method is really conservative so that the tensor can be used for most functions.
+     *
+     * @return True if the strides or the offset to the first element have changed.
+     */
+    virtual bool auto_padding() = 0;
+    /** Update the offset to the first element, the strides and the total size.
+     *
+     * @note This function can only increase the offset, strides and total size.
+     *
+     * @param[in] padding Padding around the XY plane in number of elements.
+     *
+     * @return True if the strides, offset and total size have changed.
+     */
+    virtual bool extend_padding(const PaddingSize &padding) = 0;
+    /** Return the size of the requested dimension
+     *
+     * @param[in] index Index of the dimension
+     *
+     * @return Dimension of the requested dimension
+     */
+    virtual size_t dimension(size_t index) const = 0;
+    /** The strides in bytes for accessing each dimension of the tensor
+     *
+     * @return Strides in bytes for each tensor dimension
+     */
+    virtual const Strides &strides_in_bytes() const = 0;
+    /** The offset from the beginning of the memory allocation to the first element of the tensor.
+     *  This can be used to efficiently access elements in a 2D tensor
+     *
+     * @return The offset in bytes to access the first element of the tensor.
+     */
+    virtual size_t offset_first_element_in_bytes() const = 0;
+    /** The offset in bytes from the beginning of the memory allocation to access the element at position (x, y, z ...)
+     *
+     * @param[in] pos Vector with the coordinates of the element to access.
+     *                The size of this vector must be equal to the number of dimensions of the tensor
+     *
+     * @return Offset in bytes from the beginning of the memory allocation to access the element (x, y, z, ...)
+     */
+    virtual size_t offset_element_in_bytes(const Coordinates &pos) const = 0;
+    /** Fixed point position used when the tensor data type is QS8 or QS16
+     *
+     * @return The fixed point position that expresses the number of bits for the fractional part of the number
+     */
+    virtual int fixed_point_position() const = 0;
+    /** Element size in bytes calculated as data_size() * num_channels()
+     *
+     * @return The size of one element in bytes
+     */
+    virtual size_t element_size() const = 0;
+    /** The number of dimensions of the tensor (rank)
+     *
+     * @return The number of dimensions of the tensor (rank)
+     */
+    virtual size_t num_dimensions() const = 0;
+    /** The number of channels for each tensor element
+     *
+     * @return The number of channels for each tensor element
+     */
+    virtual size_t num_channels() const = 0;
+    /** Size for each dimension of the tensor
+     *
+     * @return A vector with the size for each dimension of the tensor
+     */
+    virtual const TensorShape &tensor_shape() const = 0;
+    /** Data type used for each element of the tensor
+     *
+     * @return Tensor data type
+     */
+    virtual DataType data_type() const = 0;
+    /** Colour format of the image
+     *
+     * @return Colour format of the image
+     */
+    virtual Format format() const = 0;
+    /** Returns the total size of the tensor in bytes.
+     *
+     * @return Total size of the tensor in bytes.
+     */
+    virtual size_t total_size() const = 0;
+    /** Padding of tensor.
+     *
+     * @return Padding.
+     */
+    virtual PaddingSize padding() const = 0;
+    /** Checks if the tensor has been allocated with padding or not.
+     *
+     * @return True if padding is allocated in the tensor, otherwise false.
+     */
+    virtual bool has_padding() const = 0;
+    /** Flag indicating whether the size of the tensor can be changed.
+     *
+     * @return True if the tensor size can be changed.
+     */
+    virtual bool is_resizable() const = 0;
+    /** Set the flag whether the tensor size can be changed.
+     *
+     * @param[in] is_resizable Flag that marks the tensor if it can be changed or not.
+     */
+    virtual void set_is_resizable(bool is_resizable) = 0;
+    /** Valid region of the tensor. All elements in the valid region have defined values, i.e. are not undefined.
+     *
+     * @return The valid region.
+     */
+    virtual ValidRegion valid_region() const = 0;
+    /** Set the valid region of the tensor.
+     *
+     * @param[in] valid_region Valid region to set.
+     */
+    virtual void set_valid_region(ValidRegion valid_region) = 0;
+};
+}
+#endif /*__ARM_COMPUTE_ITENSORINFO_H__ */
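ITensorInfo exposes per-dimension byte strides and the offset of the first element; the byte offset of an element at coordinates (x, y, z, ...) returned by offset_element_in_bytes() is that base offset plus the dot product of the coordinates with strides_in_bytes(). A standalone worked example of that computation (illustration only, not the library implementation):

    #include <array>
    #include <cstddef>
    #include <iostream>

    int main()
    {
        // A 2D F32 tensor of 16x8 elements, padded to 20 elements per row:
        // stride_x = 4 bytes, stride_y = 20 * 4 = 80 bytes.
        const std::array<size_t, 2> strides_in_bytes{ { 4, 80 } };
        const size_t offset_first_element_in_bytes = 0;

        const std::array<size_t, 2> pos{ { 3, 5 } }; // element (x = 3, y = 5)

        size_t offset = offset_first_element_in_bytes;
        for(size_t d = 0; d < pos.size(); ++d)
        {
            offset += pos[d] * strides_in_bytes[d];
        }

        std::cout << offset << std::endl; // 3*4 + 5*80 = 412 bytes
    }
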
diff --git a/arm_compute/core/NEON/NEFixedPoint.h b/arm_compute/core/NEON/NEFixedPoint.h
new file mode 100644
index 0000000..fb71261
--- /dev/null
+++ b/arm_compute/core/NEON/NEFixedPoint.h
@@ -0,0 +1,686 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEFIXEDPOINT_H__
+#define __ARM_COMPUTE_NEFIXEDPOINT_H__
+
+#include "arm_compute/core/FixedPoint.h"
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+using qint8x8_t    = int8x8_t;    /**< 8 bit fixed point vector with 8 elements */
+using qint8x8x2_t  = int8x8x2_t;  /**< 8 bit fixed point vector with 16 elements */
+using qint8x8x3_t  = int8x8x3_t;  /**< 8 bit fixed point vector with 24 elements */
+using qint8x8x4_t  = int8x8x4_t;  /**< 8 bit fixed point vector with 32 elements */
+using qint8x16_t   = int8x16_t;   /**< 8 bit fixed point vector with 16 elements */
+using qint8x16x2_t = int8x16x2_t; /**< 8 bit fixed point vector with 32 elements */
+using qint8x16x3_t = int8x16x3_t; /**< 8 bit fixed point vector with 48 elements */
+using qint8x16x4_t = int8x16x4_t; /**< 8 bit fixed point vector with 64 elements */
+using qint16x4_t   = int16x4_t;   /**< 16 bit fixed point vector with 4 elements */
+using qint16x4x2_t = int16x4x2_t; /**< 16 bit fixed point vector with 8 elements */
+using qint16x4x3_t = int16x4x3_t; /**< 16 bit fixed point vector with 12 elements */
+using qint16x4x4_t = int16x4x4_t; /**< 16 bit fixed point vector with 16 elements */
+using qint16x8_t   = int16x8_t;   /**< 16 bit fixed point vector with 8 elements */
+using qint16x8x2_t = int16x8x2_t; /**< 16 bit fixed point vector with 16 elements */
+using qint16x8x3_t = int16x8x3_t; /**< 16 bit fixed point vector with 24 elements */
+using qint16x8x4_t = int16x8x4_t; /**< 16 bit fixed point vector with 32 elements */
+
+/** Get the lower half of a 16 elements vector
+ *
+ * @param[in] a vector of 16 elements
+ *
+ * @return 8 bit fixed point vector (8 elements)
+ */
+qint8x8_t vget_low_qs8(qint8x16_t a);
+
+/** Get the higher half of a 16 elements vector
+ *
+ * @param[in] a vector of 16 elements
+ *
+ * @return 8 bit fixed point vector (8 elements)
+ */
+qint8x8_t vget_high_qs8(qint8x16_t a);
+
+/** Load a single 8 bit fixed point vector from memory (8 elements)
+ *
+ * @param[in] addr Memory address of the 8 bit fixed point vector to load
+ *
+ * @return 8 bit fixed point vector (8 elements)
+ */
+qint8x8_t vld1_qs8(const qint8_t *addr);
+
+/** Load a single 8 bit fixed point vector from memory (16 elements)
+ *
+ * @param[in] addr Memory address of the 8 bit fixed point vector to load
+ *
+ * @return 8 bit fixed point vector (16 elements)
+ */
+qint8x16_t vld1q_qs8(const qint8_t *addr);
+
+/** Load a single 16 bit fixed point vector from memory (4 elements)
+ *
+ * @param[in] addr Memory address of the 16 bit fixed point vector to load
+ *
+ * @return 16 bit fixed point vector (4 elements)
+ */
+qint16x4_t vld1_qs16(const qint16_t *addr);
+
+/** Load a single 16 bit fixed point vector from memory (8 elements)
+ *
+ * @param[in] addr Memory address of the 16 bit fixed point vector to load
+ *
+ * @return 16 bit fixed point vector (8 elements)
+ */
+qint16x8_t vld1q_qs16(const qint16_t *addr);
+
+/** Load all lanes of 8 bit fixed point vector with same value from memory (8 elements)
+ *
+ * @param[in] addr Memory address of the 8 bit fixed point scalar value to load
+ *
+ * @return 8 bit fixed point vector (8 elements)
+ */
+qint8x8_t vld1_dup_qs8(const qint8_t *addr);
+
+/** Load all lanes of 8 bit fixed point vector with same value from memory (16 elements)
+ *
+ * @param[in] addr Memory address of the 8 bit fixed point scalar value to load
+ *
+ * @return 8 bit fixed point vector (16 elements)
+ */
+qint8x16_t vld1q_dup_qs8(const qint8_t *addr);
+
+/** Store a single 8 bit fixed point vector to memory (8 elements)
+ *
+ * @param[in] addr Memory address where the 8 bit fixed point vector should be stored
+ * @param[in] b    8 bit fixed point vector to store
+ *
+ */
+void vst1_qs8(qint8_t *addr, qint8x8_t b);
+
+/** Store a single 8 bit fixed point vector to memory (16 elements)
+ *
+ * @param[in] addr Memory address where the 8 bit fixed point vector should be stored
+ * @param[in] b    8 bit fixed point vector to store
+ *
+ */
+void vst1q_qs8(qint8_t *addr, qint8x16_t b);
+
+/** Store a single 16 bit fixed point vector to memory (4 elements)
+ *
+ * @param[in] addr Memory address where the 16 bit fixed point vector should be stored
+ * @param[in] b    16 bit fixed point vector to store
+ *
+ */
+void vst1_qs16(qint16_t *addr, qint16x4_t b);
+
+/** Store a single 16 bit fixed point vector to memory (8 elements)
+ *
+ * @param[in] addr Memory address where the 16 bit fixed point vector should be stored
+ * @param[in] b    16 bit fixed point vector to store
+ *
+ */
+void vst1q_qs16(qint16_t *addr, qint16x8_t b);
+
+/** 16 bit fixed point vector saturating narrow (8 elements)
+ *
+ * @param[in] a 16 bit fixed point vector to convert
+ *
+ * @return 8 bit fixed point vector
+ */
+qint8x8_t vqmovn_q16(qint16x8_t a);
+
+/** 8 bit fixed point vector duplicate (8 elements)
+ *
+ * @param[in] a 8 bit fixed point to duplicate
+ *
+ * @return The result of the vector duplication
+ */
+qint8x8_t vdup_n_qs8(qint8_t a);
+
+/** 8 bit fixed point vector duplicate (16 elements)
+ *
+ * @param[in] a 8 bit fixed point to duplicate
+ *
+ * @return The result of the vector duplication
+ */
+qint8x16_t vdupq_n_qs8(qint8_t a);
+
+/** Duplicate a float and convert it to 8 bit fixed point vector (16 elements)
+ *
+ * @param[in] a                    Floating point value to duplicate and convert
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the vector duplication
+ */
+qint8x16_t vdupq_n_qs8_f32(float a, int fixed_point_position);
+
+/** 16 bit fixed point vector duplicate (8 elements)
+ *
+ * @param[in] a 16 bit fixed point to duplicate
+ *
+ * @return The result of the vector duplication
+ */
+qint16x8_t vdupq_n_qs16(qint16_t a);
+
+/** Absolute value of 8 bit fixed point vector (8 elements)
+ *
+ * @param[in] a 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector absolute value
+ */
+qint8x8_t vabs_qs8(qint8x8_t a);
+
+/** Absolute value of 8 bit fixed point vector (16 elements)
+ *
+ * @param[in] a 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector absolute value
+ */
+qint8x16_t vabsq_qs8(qint8x16_t a);
+
+/** Saturating absolute value of 8 bit fixed point vector (8 elements)
+ *
+ * @param[in] a 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector absolute value
+ */
+qint8x8_t vqabs_qs8(qint8x8_t a);
+
+/** Saturating absolute value of 8 bit fixed point vector (16 elements)
+ *
+ * @param[in] a 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector absolute value
+ */
+qint8x16_t vqabsq_qs8(qint8x16_t a);
+
+/** 8 bit fixed point vector max (8 elements)
+ *
+ * @param[in] a First 8 bit fixed point input vector
+ * @param[in] b Second 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector max operation
+ */
+qint8x8_t vmax_qs8(qint8x8_t a, qint8x8_t b);
+
+/** 8 bit fixed point vector max (16 elements)
+ *
+ * @param[in] a First 8 bit fixed point input vector
+ * @param[in] b Second 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector max operation
+ */
+qint8x16_t vmaxq_qs8(qint8x16_t a, qint8x16_t b);
+
+/** 8 bit fixed point vector pairwise max (8 elements)
+ *
+ * @param[in] a First 8 bit fixed point input vector
+ * @param[in] b Second 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector pairwise max operation
+ */
+qint8x8_t vpmax_qs8(qint8x8_t a, qint8x8_t b);
+
+/** 8 bit fixed point vector min (8 elements)
+ *
+ * @param[in] a First 8 bit fixed point input vector
+ * @param[in] b Second 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector min operation
+ */
+qint8x8_t vmin_qs8(qint8x8_t a, qint8x8_t b);
+
+/** 8 bit fixed point vector min (16 elements)
+ *
+ * @param[in] a First 8 bit fixed point input vector
+ * @param[in] b Second 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector min operation
+ */
+qint8x16_t vminq_qs8(qint8x16_t a, qint8x16_t b);
+
+/** 8 bit fixed point vector pairwise min (8 elements)
+ *
+ * @param[in] a First 8 bit fixed point input vector
+ * @param[in] b Second 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector pairwise min operation
+ */
+qint8x8_t vpmin_qs8(qint8x8_t a, qint8x8_t b);
+
+/** 8 bit fixed point vector add (8 elements)
+ *
+ * @param[in] a First 8 bit fixed point input vector
+ * @param[in] b Second 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector addition
+ */
+qint8x8_t vadd_qs8(qint8x8_t a, qint8x8_t b);
+
+/** 8 bit fixed point vector add (16 elements)
+ *
+ * @param[in] a First 8 bit fixed point input vector
+ * @param[in] b Second 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector addition
+ */
+qint8x16_t vaddq_qs8(qint8x16_t a, qint8x16_t b);
+
+/** 8 bit fixed point vector saturating add (8 elements)
+ *
+ * @param[in] a First 8 bit fixed point input vector
+ * @param[in] b Second 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector addition. The result is saturated in case of overflow
+ */
+qint8x8_t vqadd_qs8(qint8x8_t a, qint8x8_t b);
+
+/** 8 bit fixed point vector saturating add (16 elements)
+ *
+ * @param[in] a First 8 bit fixed point input vector
+ * @param[in] b Second 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector addition. The result is saturated in case of overflow
+ */
+qint8x16_t vqaddq_qs8(qint8x16_t a, qint8x16_t b);
+
+/** 16 bit fixed point vector saturating add (4 elements)
+ *
+ * @param[in] a First 16 bit fixed point input vector
+ * @param[in] b Second 16 bit fixed point input vector
+ *
+ * @return The result of the 16 bit fixed point vector addition. The result is saturated in case of overflow
+ */
+qint16x4_t vqadd_qs16(qint16x4_t a, qint16x4_t b);
+
+/** 16 bit fixed point vector saturating add (8 elements)
+ *
+ * @param[in] a First 16 bit fixed point input vector
+ * @param[in] b Second 16 bit fixed point input vector
+ *
+ * @return The result of the 16 bit fixed point vector addition. The result is saturated in case of overflow
+ */
+qint16x8_t vqaddq_qs16(qint16x8_t a, qint16x8_t b);
+
+/** 8 bit fixed point vector pairwise add long (8 elements)
+ *
+ * @param[in] a 8 bit fixed point input vector
+ *
+ * @return The 16 bit widened result of the pairwise addition
+ */
+int16x4_t vpaddl_qs8(qint8x8_t a);
+
+/** 8 bit fixed point vector subtraction (8 elements)
+ *
+ * @param[in] a First 8 bit fixed point input vector
+ * @param[in] b Second 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector subtraction
+ */
+qint8x8_t vsub_qs8(qint8x8_t a, qint8x8_t b);
+
+/** 8 bit fixed point vector subtraction (16 elements)
+ *
+ * @param[in] a First 8 bit fixed point input vector
+ * @param[in] b Second 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector subtraction
+ */
+qint8x16_t vsubq_qs8(qint8x16_t a, qint8x16_t b);
+
+/** 8 bit fixed point vector saturating subtraction (8 elements)
+ *
+ * @param[in] a First 8 bit fixed point input vector
+ * @param[in] b Second 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector subtraction. The result is saturated in case of overflow
+ */
+qint8x8_t vqsub_qs8(qint8x8_t a, qint8x8_t b);
+
+/** 8 bit fixed point vector saturating subtraction (16 elements)
+ *
+ * @param[in] a First 8 bit fixed point input vector
+ * @param[in] b Second 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector subtraction. The result is saturated in case of overflow
+ */
+qint8x16_t vqsubq_qs8(qint8x16_t a, qint8x16_t b);
+
+/** 8 bit fixed point vector multiply (8 elements)
+ *
+ * @param[in] a                    First 8 bit fixed point input vector
+ * @param[in] b                    Second 8 bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8 bit fixed point vector multiplication.
+ */
+qint8x8_t vmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position);
+
+/** 8 bit fixed point vector multiply (16 elements)
+ *
+ * @param[in] a                    First 8 bit fixed point input vector
+ * @param[in] b                    Second 8 bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8 bit fixed point vector multiplication.
+ */
+qint8x16_t vmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position);
+
+/** 8 bit fixed point vector saturating multiply (8 elements)
+ *
+ * @param[in] a                    First 8 bit fixed point input vector
+ * @param[in] b                    Second 8 bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8 bit fixed point vector multiplication. The result is saturated in case of overflow
+ */
+qint8x8_t vqmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position);
+
+/** 8 bit fixed point vector saturating multiply (16 elements)
+ *
+ * @param[in] a                    First 8 bit fixed point input vector
+ * @param[in] b                    Second 8 bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8 bit fixed point vector multiplication. The result is saturated in case of overflow
+ */
+qint8x16_t vqmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position);
+
+/** 8 bit fixed point vector long multiply (8 elements)
+ *
+ * @param[in] a                    First 8 bit fixed point input vector
+ * @param[in] b                    Second 8 bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8 bit fixed point long vector multiplication.
+ */
+qint16x8_t vmull_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position);
+
+/** 8 bit fixed point vector multiply-accumulate (8 elements). This operation performs the product between @p b and @p c and adds the result to @p a (a + b * c).
+ *
+ * @param[in] a                    First 8 bit fixed point input vector where the result of multiplication must be added to
+ * @param[in] b                    Second 8 bit fixed point input vector
+ * @param[in] c                    Third 8 bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8 bit fixed point vector multiply-accumulate
+ */
+qint8x8_t vmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position);
+
+/** 8 bit fixed point vector multiply-accumulate (16 elements). This operation performs the product between @p b and @p c and adds the result to @p a (a + b * c).
+ *
+ * @param[in] a                    First 8 bit fixed point input vector where the result of multiplication must be added to
+ * @param[in] b                    Second 8 bit fixed point input vector
+ * @param[in] c                    Third 8 bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8 bit fixed point vector multiply-accumulate
+ */
+qint8x16_t vmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position);
+
+/** 8 bit fixed point vector saturating multiply-accumulate (8 elements). This operation performs the product between @p b and @p c and adds the result to @p a (a + b * c).
+ *
+ * @param[in] a                    First 8 bit fixed point input vector where the result of multiplication must be added to
+ * @param[in] b                    Second 8 bit fixed point input vector
+ * @param[in] c                    Third 8 bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8 bit fixed point vector multiply-accumulate. The result is saturated in case of overflow
+ */
+qint8x8_t vqmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position);
+
+/** 8 bit fixed point vector saturating multiply-accumulate (16 elements). This operation performs the product between @p b and @p c and adds the result to @p a (a + b * c).
+ *
+ * @param[in] a                    First 8 bit fixed point input vector where the result of multiplication must be added to
+ * @param[in] b                    Second 8 bit fixed point input vector
+ * @param[in] c                    Third 8 bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8 bit fixed point vector multiply-accumulate. The result is saturated in case of overflow
+ */
+qint8x16_t vqmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position);
+
+/** 8 bit fixed point vector multiply-accumulate long (8 elements).
+ *  This operation performs the product between @p b and @p c and adds the result to the 16 bit fixed point vector @p a (a + b * c).
+ *
+ * @param[in] a                    First 16 bit fixed point input vector where the result of multiplication must be added to
+ * @param[in] b                    Second 8 bit fixed point input vector
+ * @param[in] c                    Third 8 bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8 bit fixed point vector multiply-accumulate long
+ */
+qint16x8_t vmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position);
+
+/** 8 bit fixed point vector saturating multiply-accumulate long (8 elements). The saturation is performed on the 16 bit fixed point output vector.
+ *  This operation performs the product between @p b and @p c and adds the result to the 16 bit fixed point vector @p a (a + b * c).
+ *
+ * @param[in] a                    First 16 bit fixed point input vector where the result of multiplication must be added to
+ * @param[in] b                    Second 8 bit fixed point input vector
+ * @param[in] c                    Third 8 bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8 bit fixed point vector multiply-accumulate long
+ */
+qint16x8_t vqmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position);
+
+/** Convert a float vector with 4x2 elements to 8 bit fixed point vector with 8 elements
+ *
+ * @param[in] a                    Float input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the conversion float -> 8 bit fixed point
+ */
+qint8x8_t vcvt_qs8_f32(const float32x4x2_t &a, int fixed_point_position);
+
+/** Convert a float vector with 4x4 elements to 8 bit fixed point vector with 16 elements
+ *
+ * @param[in] a                    Float input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the conversion float -> 8 bit fixed point
+ */
+qint8x16_t vcvtq_qs8_f32(const float32x4x4_t &a, int fixed_point_position);
+
+/** Convert a 8 bit fixed point vector with 8 elements to a float vector with 4x2 elements
+ *
+ * @param[in] a                    8 bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the conversion 8 bit fixed point -> float32x4x2
+ */
+float32x4x2_t vcvt_f32_qs8(qint8x8_t a, int fixed_point_position);
+
+/** Convert a 8 bit fixed point vector with 16 elements to a float vector with 4x4 elements
+ *
+ * @param[in] a                    8 bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the conversion 8 bit fixed point -> float32x4x4
+ */
+float32x4x4_t vcvtq_f32_qs8(qint8x16_t a, int fixed_point_position);
+
+/** Calculate reciprocal of a fixed point 8bit number using the Newton-Raphson method. (8 elements)
+ *
+ * @param[in] a                    8bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8bit reciprocal (1/a).
+ */
+qint8x8_t vrecip_qs8(qint8x8_t a, int fixed_point_position);
+
+/** Calculate reciprocal of a fixed point 8bit number using the Newton-Raphson method. (16 elements)
+ *
+ * @param[in] a                    8bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8bit reciprocal (1/a).
+ */
+qint8x16_t vrecipq_qs8(qint8x16_t a, int fixed_point_position);
+
+/** Division fixed point 8bit (8 elements)
+ *
+ * @param[in] a                    First 8bit fixed point input vector
+ * @param[in] b                    Second 8bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The quotient in 8bit fixed point format.
+ */
+qint8x8_t vdiv_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position);
+
+/** Division fixed point 8bit (16 elements)
+ *
+ * @param[in] a                    First 8bit fixed point input vector
+ * @param[in] b                    Second 8bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The quotient in 8bit fixed point format.
+ */
+qint8x16_t vdivq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position);
+
+/** Perform a 4th degree polynomial approximation. (8 elements)
+ *
+ * @param[in] a                    8bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8bit Taylor approximation.
+ */
+template <bool islog>
+qint8x8_t vtaylor_poly_qs8(qint8x8_t a, int fixed_point_position);
+
+/** Perform a 4th degree polynomial approximation. (16 elements)
+ *
+ * @param[in] a                    8bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8bit Taylor approximation.
+ */
+template <bool islog>
+qint8x16_t vtaylor_polyq_qs8(qint8x16_t a, int fixed_point_position);
+
+/** Calculate saturating exponential fixed point 8bit (8 elements)
+ *
+ * @param[in] a                    8bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8bit saturating exponential
+ */
+qint8x8_t vqexp_qs8(qint8x8_t a, int fixed_point_position);
+
+/** Calculate saturating exponential fixed point 8bit (16 elements)
+ *
+ * @param[in] a                    8bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8bit saturating exponential
+ */
+qint8x16_t vqexpq_qs8(qint8x16_t a, int fixed_point_position);
+
+/** Calculate logarithm fixed point 8bit (8 elements)
+ *
+ * @param[in] a                    8bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8bit logarithm.
+ */
+qint8x8_t vlog_qs8(qint8x8_t a, int fixed_point_position);
+
+/** Calculate logarithm fixed point 8bit (16 elements)
+ *
+ * @param[in] a                    8bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8bit logarithm.
+ */
+qint8x16_t vlogq_qs8(qint8x16_t a, int fixed_point_position);
+
+/** Calculate inverse square root for fixed point 8bit using the Newton-Raphson method (8 elements)
+ *
+ * @param[in] a                    8bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8bit inverse sqrt.
+ */
+qint8x8_t vinvsqrt_qs8(qint8x8_t a, int fixed_point_position);
+
+/** Calculate saturating inverse square root for fixed point 8bit using the Newton-Raphson method (8 elements)
+ *
+ * @param[in] a                    8bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8bit inverse sqrt.
+ */
+qint8x8_t vqinvsqrt_qs8(qint8x8_t a, int fixed_point_position);
+
+/** Calculate inverse square root for fixed point 8bit using the Newton-Raphson method (16 elements)
+ *
+ * @param[in] a                    8bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8bit inverse sqrt.
+ */
+qint8x16_t vinvsqrtq_qs8(qint8x16_t a, int fixed_point_position);
+
+/** Calculate saturating inverse square root for fixed point 8bit using the Newton-Raphson method (16 elements)
+ *
+ * @param[in] a                    8bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8bit inverse sqrt.
+ */
+qint8x16_t vqinvsqrtq_qs8(qint8x16_t a, int fixed_point_position);
+
+/** Calculate hyperbolic tangent for fixed point 8bit (8 elements)
+ *
+ * @param[in] a                    8bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The calculated Hyperbolic Tangent.
+ */
+qint8x8_t vtanh_qs8(qint8x8_t a, int fixed_point_position);
+
+/** Calculate hyperbolic tangent for fixed point 8bit (16 elements)
+ *
+ * @param[in] a                    8bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The calculated Hyperbolic Tangent.
+ */
+qint8x16_t vtanhq_qs8(qint8x16_t a, int fixed_point_position);
+
+/** Calculate saturating n power for fixed point 8bit (16 elements).
+ *
+ * pow(a,b) = e^(b*log(a))
+ *
+ * @param[in] a                    8bit fixed point input vector
+ * @param[in] b                    8bit fixed point power vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8bit power.
+ */
+qint8x16_t vqpowq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position);
+} // namespace arm_compute
+#include "arm_compute/core/NEON/NEFixedPoint.inl"
+#endif /* __ARM_COMPUTE_NEFIXEDPOINT_H__ */
diff --git a/arm_compute/core/NEON/NEFixedPoint.inl b/arm_compute/core/NEON/NEFixedPoint.inl
new file mode 100644
index 0000000..6db344d
--- /dev/null
+++ b/arm_compute/core/NEON/NEFixedPoint.inl
@@ -0,0 +1,1018 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+namespace arm_compute
+{
+/**< Exponent polynomial coefficients for 8 bit fixed point (8 elements)
+ *  Format is in Q0.7 for all elements */
+const std::array<qint8x8_t, 4> exp_tab_qs8 =
+{
+    {
+        vdup_n_s8(0x7F), // 0.9978546
+        vdup_n_s8(0x3F), // 0.4994721
+        vdup_n_s8(0x16), // 0.1763723
+        vdup_n_s8(0x05), // 0.0435108
+    }
+};
+
+/**< Exponent polynomial coefficients for 8 bit fixed point (16 elements)
+ * Format is in Q0.7 for all elements */
+const std::array<qint8x16_t, 4> exp_tabq_qs8 =
+{
+    {
+        vdupq_n_s8(0x7F), // 0.9978546
+        vdupq_n_s8(0x3F), // 0.4994721
+        vdupq_n_s8(0x16), // 0.1763723
+        vdupq_n_s8(0x05), // 0.0435108
+    }
+};
+
+/**< Logarithm polynomial coefficients for 8 bit fixed point (8 elements)
+ * Format is in Q0.7 for all elements except the first one which is in Q1.6 */
+const std::array<qint8x8_t, 4> log_tab_qs8 =
+{
+    {
+        vdup_n_s8(0x5C),  // 1.4384189
+        vdup_n_s8(-0x56), // -0.6771900
+        vdup_n_s8(0x29),  // 0.3218538
+        vdup_n_s8(-0x0A), // -0.0832229
+    }
+};
+
+/**< Logarithm polynomial coefficients for 8 bit fixed point (16 elements)
+ * Format is in Q0.7 for all elements except the first one which is in Q1.6 */
+const std::array<qint8x16_t, 4> log_tabq_qs8 =
+{
+    {
+        vdupq_n_s8(0x5C),  // 1.4384189
+        vdupq_n_s8(-0x56), // -0.6771900
+        vdupq_n_s8(0x29),  // 0.3218538
+        vdupq_n_s8(-0x0A), // -0.0832229
+    }
+};
+
+inline qint8x8_t vget_low_qs8(qint8x16_t a)
+{
+    return vget_low_s8(a);
+}
+
+inline qint8x8_t vget_high_qs8(qint8x16_t a)
+{
+    return vget_high_s8(a);
+}
+
+inline qint8x8_t vld1_qs8(const qint8_t *addr)
+{
+    return vld1_s8(addr);
+}
+
+inline qint8x16_t vld1q_qs8(const qint8_t *addr)
+{
+    return vld1q_s8(addr);
+}
+
+inline qint16x4_t vld1_qs16(const qint16_t *addr)
+{
+    return vld1_s16(addr);
+}
+
+inline qint16x8_t vld1q_qs16(const qint16_t *addr)
+{
+    return vld1q_s16(addr);
+}
+
+inline qint8x8_t vld1_dup_qs8(const qint8_t *addr)
+{
+    return vld1_dup_s8(addr);
+}
+
+inline qint8x16_t vld1q_dup_qs8(const qint8_t *addr)
+{
+    return vld1q_dup_s8(addr);
+}
+
+inline void vst1_qs8(qint8_t *addr, qint8x8_t b)
+{
+    vst1_s8(addr, b);
+}
+
+inline void vst1q_qs8(qint8_t *addr, qint8x16_t b)
+{
+    vst1q_s8(addr, b);
+}
+
+inline void vst1_qs16(qint16_t *addr, qint16x4_t b)
+{
+    vst1_s16(addr, b);
+}
+
+inline void vst1q_qs16(qint16_t *addr, qint16x8_t b)
+{
+    vst1q_s16(addr, b);
+}
+
+inline qint8x8_t vqmovn_qs16(qint16x8_t a)
+{
+    return vqmovn_s16(a);
+}
+
+inline qint8x8_t vdup_n_qs8(qint8_t a)
+{
+    return vdup_n_s8(a);
+}
+
+inline qint8x16_t vdupq_n_qs8(qint8_t a)
+{
+    return vdupq_n_s8(a);
+}
+
+inline qint8x16_t vdupq_n_qs8_f32(float a, int fixed_point_position)
+{
+    float32x4x4_t res =
+    {
+        {
+            vdupq_n_f32(a),
+            vdupq_n_f32(a),
+            vdupq_n_f32(a),
+            vdupq_n_f32(a),
+        }
+    };
+    return vcvtq_qs8_f32(res, fixed_point_position);
+}
+
+inline qint16x8_t vdupq_n_qs16(qint16_t a)
+{
+    return vdupq_n_s16(a);
+}
+
+inline qint8x8_t vabs_qs8(qint8x8_t a)
+{
+    return vabs_s8(a);
+}
+
+inline qint8x16_t vabsq_qs8(qint8x16_t a)
+{
+    return vabsq_s8(a);
+}
+
+inline qint8x8_t vqabs_qs8(qint8x8_t a)
+{
+    return vqabs_s8(a);
+}
+
+inline qint8x16_t vqabsq_qs8(qint8x16_t a)
+{
+    return vqabsq_s8(a);
+}
+
+inline qint8x8_t vmax_qs8(qint8x8_t a, qint8x8_t b)
+{
+    return vmax_s8(a, b);
+}
+
+inline qint8x16_t vmaxq_qs8(qint8x16_t a, qint8x16_t b)
+{
+    return vmaxq_s8(a, b);
+}
+
+inline qint8x8_t vpmax_qs8(qint8x8_t a, qint8x8_t b)
+{
+    return vpmax_s8(a, b);
+}
+
+inline qint8x8_t vmin_qs8(qint8x8_t a, qint8x8_t b)
+{
+    return vmin_s8(a, b);
+}
+
+inline qint8x16_t vminq_qs8(qint8x16_t a, qint8x16_t b)
+{
+    return vminq_s8(a, b);
+}
+
+inline qint8x8_t vpmin_qs8(qint8x8_t a, qint8x8_t b)
+{
+    return vpmin_s8(a, b);
+}
+
+inline qint8x8_t vadd_qs8(qint8x8_t a, qint8x8_t b)
+{
+    return vadd_s8(a, b);
+}
+
+inline qint8x16_t vaddq_qs8(qint8x16_t a, qint8x16_t b)
+{
+    return vaddq_s8(a, b);
+}
+
+inline qint8x8_t vqadd_qs8(qint8x8_t a, qint8x8_t b)
+{
+    return vqadd_s8(a, b);
+}
+
+inline qint8x16_t vqaddq_qs8(qint8x16_t a, qint8x16_t b)
+{
+    return vqaddq_s8(a, b);
+}
+
+inline qint16x4_t vqadd_qs16(qint16x4_t a, qint16x4_t b)
+{
+    return vqadd_s16(a, b);
+}
+
+inline qint16x8_t vqaddq_qs16(qint16x8_t a, qint16x8_t b)
+{
+    return vqaddq_s16(a, b);
+}
+
+inline int16x4_t vpaddl_qs8(qint8x8_t a)
+{
+    return vpaddl_s8(a);
+}
+
+inline qint8x8_t vsub_qs8(qint8x8_t a, qint8x8_t b)
+{
+    return vsub_s8(a, b);
+}
+
+inline qint8x16_t vsubq_qs8(qint8x16_t a, qint8x16_t b)
+{
+    return vsubq_s8(a, b);
+}
+
+inline qint8x8_t vqsub_qs8(qint8x8_t a, qint8x8_t b)
+{
+    return vqsub_s8(a, b);
+}
+
+inline qint8x16_t vqsubq_qs8(qint8x16_t a, qint8x16_t b)
+{
+    return vqsubq_s8(a, b);
+}
+
+inline qint8x8_t vmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
+{
+    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
+
+    // Initialize the temporary result with a constant used to round up the result
+    qint16x8_t res = vdupq_n_s16(1 << (fixed_point_position - 1));
+
+    // Vector multiply-accumulate long
+    res = vmlal_s8(res, a, b);
+
+    // Shift right by fixed_point_position
+    res = vshlq_s16(res, fixed_point_position_s16);
+
+    // Convert back to qint8
+    return vmovn_s16(res);
+}
+
+inline qint8x16_t vmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
+{
+    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
+
+    // Initialize the temporary results with a constant used to round up the result
+    qint16x8_t res0 = vdupq_n_s16(1 << (fixed_point_position - 1));
+    qint16x8_t res1 = res0;
+
+    // Vector multiply-accumulate long
+    res0 = vmlal_s8(res0, vget_low_s8(a), vget_low_s8(b));
+    res1 = vmlal_s8(res1, vget_high_s8(a), vget_high_s8(b));
+
+    // Shift right by fixed_point_position
+    res0 = vshlq_s16(res0, fixed_point_position_s16);
+    res1 = vshlq_s16(res1, fixed_point_position_s16);
+
+    // Convert back to qint8
+    return vcombine_s8(vmovn_s16(res0), vmovn_s16(res1));
+}
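The rounding multiply above reduces to a short scalar recipe. The sketch below is an illustrative standalone helper (names are invented, not part of this patch) showing the same widen, add-rounding-bias, shift-right, narrow sequence:

#include <cstdint>

// Scalar sketch of the QS8 multiply above: widen to 16 bit, add the rounding
// bias 2^(fixed_point_position - 1), arithmetic shift right, then narrow.
static inline std::int8_t scalar_mul_qs8(std::int8_t a, std::int8_t b, int fixed_point_position)
{
    std::int16_t res = static_cast<std::int16_t>(1 << (fixed_point_position - 1)); // rounding bias
    res = static_cast<std::int16_t>(res + a * b);                                  // like vmlal_s8
    res = static_cast<std::int16_t>(res >> fixed_point_position);                  // like vshlq_s16 by -fixed_point_position
    return static_cast<std::int8_t>(res);                                          // like vmovn_s16 (truncating narrow)
}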
+
+inline qint8x8_t vqmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
+{
+    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
+
+    // Initialize the temporary result with a constant used to round up the result
+    qint16x8_t res = vdupq_n_s16(1 << (fixed_point_position - 1));
+
+    // Vector multiply-accumulate long
+    res = vmlal_s8(res, a, b);
+
+    // Shift right by fixed_point_position
+    res = vqshlq_s16(res, fixed_point_position_s16);
+
+    // Convert back to qint8 and saturate
+    return vqmovn_s16(res);
+}
+
+inline qint8x16_t vqmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
+{
+    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
+
+    // Initialize the temporary results with a constant used to round up the result
+    qint16x8_t res0 = vdupq_n_s16(1 << (fixed_point_position - 1));
+    qint16x8_t res1 = res0;
+
+    // Vector multiply-accumulate long
+    res0 = vmlal_s8(res0, vget_low_s8(a), vget_low_s8(b));
+    res1 = vmlal_s8(res1, vget_high_s8(a), vget_high_s8(b));
+
+    // Shift right by fixed_point_position
+    res0 = vqshlq_s16(res0, fixed_point_position_s16);
+    res1 = vqshlq_s16(res1, fixed_point_position_s16);
+
+    // Convert back to qint8 and saturate
+    return vcombine_s8(vqmovn_s16(res0), vqmovn_s16(res1));
+}
+
+inline qint16x8_t vmull_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
+{
+    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
+
+    qint16x8_t res = vmull_s8(a, b);
+
+    return vqrshlq_s16(res, fixed_point_position_s16);
+}
+
+inline qint8x8_t vmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
+{
+    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
+
+    // Initialize the temporary results with a constant used to round up the result
+    qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));
+
+    // Vector multiply-accumulate long
+    tmp = vmlal_s8(tmp, b, c);
+
+    // Shift right by fixed_point_position
+    tmp = vshlq_s16(tmp, fixed_point_position_s16);
+
+    // Convert back to qint8 and accumulate
+    return vadd_s8(a, vmovn_s16(tmp));
+}
+
+inline qint8x16_t vmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position)
+{
+    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
+
+    // Initialize the temporary results with a constant used to round up the result
+    qint16x8_t tmp0 = vdupq_n_s16(1 << (fixed_point_position - 1));
+    qint16x8_t tmp1 = tmp0;
+
+    // Vector multiply-accumulate long
+    tmp0 = vmlal_s8(tmp0, vget_low_s8(b), vget_low_s8(c));
+    tmp1 = vmlal_s8(tmp1, vget_high_s8(b), vget_high_s8(c));
+
+    // Shift right by fixed_point_position
+    tmp0 = vshlq_s16(tmp0, fixed_point_position_s16);
+    tmp1 = vshlq_s16(tmp1, fixed_point_position_s16);
+
+    // Convert back to qint8 and accumulate
+    return vcombine_s8(vadd_s8(vget_low_s8(a), vmovn_s16(tmp0)), vadd_s8(vget_high_s8(a), vmovn_s16(tmp1)));
+}
+
+inline qint8x8_t vqmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
+{
+    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
+
+    // Initialize the temporary results with a constant used to round up the result
+    qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));
+
+    // Vector multiply-accumulate long
+    tmp = vmlal_s8(tmp, b, c);
+
+    // Shift right by fixed_point_position
+    tmp = vqshlq_s16(tmp, fixed_point_position_s16);
+
+    // Convert back to qint8 and accumulate
+    return vqadd_s8(a, vqmovn_s16(tmp));
+}
+
+inline qint8x16_t vqmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position)
+{
+    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
+
+    // Initialize the temporary results with a constant used to round up the result
+    qint16x8_t tmp0 = vdupq_n_s16(1 << (fixed_point_position - 1));
+    qint16x8_t tmp1 = tmp0;
+
+    // Vector multiply-accumulate long
+    tmp0 = vmlal_s8(tmp0, vget_low_s8(b), vget_low_s8(c));
+    tmp1 = vmlal_s8(tmp1, vget_high_s8(b), vget_high_s8(c));
+
+    // Shift right by fixed_point_position
+    tmp0 = vqshlq_s16(tmp0, fixed_point_position_s16);
+    tmp1 = vqshlq_s16(tmp1, fixed_point_position_s16);
+
+    // Convert back to qint8 and accumulate
+    qint8x16_t res = vcombine_s8(vqmovn_s16(tmp0), vqmovn_s16(tmp1));
+    return vqaddq_s8(a, res);
+}
+
+inline qint16x8_t vmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
+{
+    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
+
+    // Initialize the temporary results with a constant used to round up the result
+    qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));
+
+    // Vector multiply-accumulate long
+    tmp = vmlal_s8(tmp, b, c);
+
+    // Shift right by fixed_point_position
+    tmp = vshlq_s16(tmp, fixed_point_position_s16);
+
+    // Accumulate
+    return vaddq_s16(a, tmp);
+}
+
+inline qint16x8_t vqmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
+{
+    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
+
+    // Initialize the temporary results with a constant used to round up the result
+    qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));
+
+    // Vector multiply-accumulate long
+    tmp = vmlal_s8(tmp, b, c);
+
+    // Shift right by fixed_point_position
+    tmp = vqshlq_s16(tmp, fixed_point_position_s16);
+
+    // Accumulate
+    return vqaddq_s16(a, tmp);
+}
+
+inline qint8x8_t vcvt_qs8_f32(const float32x4x2_t &a, int fixed_point_position)
+{
+    const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));
+
+    float32x4x2_t res_f32 =
+    {
+        {
+            vdupq_n_f32(0.5f),
+            vdupq_n_f32(0.5f)
+        }
+    };
+
+    res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2);
+    res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2);
+
+    const int32x4x2_t res_s32 =
+    {
+        {
+            vcvtq_s32_f32(res_f32.val[0]),
+            vcvtq_s32_f32(res_f32.val[1]),
+        }
+    };
+
+    const int16x8_t res_s16 = vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1]));
+
+    return vqmovn_s16(res_s16);
+}
+
+inline qint8x16_t vcvtq_qs8_f32(const float32x4x4_t &a, int fixed_point_position)
+{
+    const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));
+
+    float32x4x4_t res_f32 =
+    {
+        {
+            vdupq_n_f32(0.5f),
+            vdupq_n_f32(0.5f),
+            vdupq_n_f32(0.5f),
+            vdupq_n_f32(0.5f)
+        }
+    };
+
+    res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2);
+    res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2);
+    res_f32.val[2] = vmlaq_f32(res_f32.val[2], a.val[2], pow2);
+    res_f32.val[3] = vmlaq_f32(res_f32.val[3], a.val[3], pow2);
+
+    const int32x4x4_t res_s32 =
+    {
+        {
+            vcvtq_s32_f32(res_f32.val[0]),
+            vcvtq_s32_f32(res_f32.val[1]),
+            vcvtq_s32_f32(res_f32.val[2]),
+            vcvtq_s32_f32(res_f32.val[3]),
+        }
+    };
+
+    const int16x8x2_t res_s16 =
+    {
+        {
+            vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1])),
+            vcombine_s16(vqmovn_s32(res_s32.val[2]), vqmovn_s32(res_s32.val[3])),
+        }
+    };
+
+    return vcombine_s8(vqmovn_s16(res_s16.val[0]), vqmovn_s16(res_s16.val[1]));
+}
+
+inline float32x4x2_t vcvt_f32_qs8(qint8x8_t a, int fixed_point_position)
+{
+    const float32x4_t pow2 = vdupq_n_f32(1.0f / (1 << fixed_point_position));
+
+    const int16x8_t res_s16 = vmovl_s8(a);
+
+    const int32x4x2_t res_s32 =
+    {
+        {
+            vmovl_s16(vget_low_s16(res_s16)),
+            vmovl_s16(vget_high_s16(res_s16))
+        }
+    };
+
+    float32x4x2_t res_f32 =
+    {
+        {
+            vcvtq_f32_s32(res_s32.val[0]),
+            vcvtq_f32_s32(res_s32.val[1])
+        }
+    };
+
+    res_f32.val[0] = vmulq_f32(res_f32.val[0], pow2);
+    res_f32.val[1] = vmulq_f32(res_f32.val[1], pow2);
+
+    return res_f32;
+}
+
+inline float32x4x4_t vcvtq_f32_qs8(qint8x16_t a, int fixed_point_position)
+{
+    const float32x4_t pow2 = vdupq_n_f32(1.0f / (1 << fixed_point_position));
+
+    const int16x8x2_t res_s16 =
+    {
+        {
+            vmovl_s8(vget_low_s8(a)),
+            vmovl_s8(vget_high_s8(a)),
+        }
+    };
+
+    const int32x4x4_t res_s32 =
+    {
+        {
+            vmovl_s16(vget_low_s16(res_s16.val[0])),
+            vmovl_s16(vget_high_s16(res_s16.val[0])),
+            vmovl_s16(vget_low_s16(res_s16.val[1])),
+            vmovl_s16(vget_high_s16(res_s16.val[1])),
+        }
+    };
+
+    float32x4x4_t res_f32 =
+    {
+        {
+            vcvtq_f32_s32(res_s32.val[0]),
+            vcvtq_f32_s32(res_s32.val[1]),
+            vcvtq_f32_s32(res_s32.val[2]),
+            vcvtq_f32_s32(res_s32.val[3])
+        }
+    };
+
+    res_f32.val[0] = vmulq_f32(res_f32.val[0], pow2);
+    res_f32.val[1] = vmulq_f32(res_f32.val[1], pow2);
+    res_f32.val[2] = vmulq_f32(res_f32.val[2], pow2);
+    res_f32.val[3] = vmulq_f32(res_f32.val[3], pow2);
+
+    return res_f32;
+}
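For reference, the two conversions above amount to the following scalar operations; this is an illustrative sketch with invented helper names, not part of the patch:

#include <algorithm>
#include <cstdint>

// float -> QS8: scale by 2^fixed_point_position, add 0.5, truncate towards zero
// and saturate to int8 (mirroring the vmlaq_f32 / vcvtq_s32_f32 / vqmovn chain above).
static inline std::int8_t scalar_float_to_qs8(float x, int fixed_point_position)
{
    const int i = static_cast<int>(x * static_cast<float>(1 << fixed_point_position) + 0.5f);
    return static_cast<std::int8_t>(std::min(127, std::max(-128, i)));
}

// QS8 -> float: divide by 2^fixed_point_position (mirroring vcvtq_f32_s32 + vmulq_f32).
static inline float scalar_qs8_to_float(std::int8_t q, int fixed_point_position)
{
    return static_cast<float>(q) / static_cast<float>(1 << fixed_point_position);
}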
+
+inline qint8x8_t vrecip_qs8(qint8x8_t a, int fixed_point_position)
+{
+    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
+    const qint8x8_t const_48_over_17       = vdup_n_s8(0x7A >> (5 - fixed_point_position));    // 2.823
+    const qint8x8_t const_minus_32_over_17 = vdup_n_s8(-(0x3C >> (5 - fixed_point_position))); // -1.8823
+    const qint8x8_t const_one              = vdup_n_s8(1 << fixed_point_position);
+
+    // Find shift value
+    const qint8x8_t shift_value = vneg_s8(vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
+    const qint8x8_t temp        = vshl_s8(a, shift_value);
+
+    qint8x8_t x = vadd_s8(const_48_over_17, vmul_qs8(temp, const_minus_32_over_17, fixed_point_position));
+
+    uint8x8_t set_one = vcgt_s8(x, const_one);
+    x                 = vbsl_s8(set_one, const_one, x);
+
+    // Use three iterations of the Newton-Raphson method to get the result
+    x = vadd_s8(x, vmul_qs8(x, vsub_s8(const_one, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
+    x = vadd_s8(x, vmul_qs8(x, vsub_s8(const_one, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
+    x = vadd_s8(x, vmul_qs8(x, vsub_s8(const_one, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
+
+    return vshl_s8(x, shift_value);
+}
+
+inline qint8x16_t vrecipq_qs8(qint8x16_t a, int fixed_point_position)
+{
+    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
+    const qint8x16_t const_48_over_17       = vdupq_n_s8(0x7A >> (5 - fixed_point_position));   // 2.823
+    const qint8x16_t const_minus_32_over_17 = vdupq_n_s8((0x3C >> (5 - fixed_point_position))); // -1.8823
+    const qint8x16_t const_one              = vdupq_n_s8(1 << fixed_point_position);
+
+    // Find shift value
+    const qint8x16_t shift_value = vnegq_s8(vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
+    const qint8x16_t temp        = vshlq_s8(a, shift_value);
+
+    qint8x16_t x = vsubq_qs8(const_48_over_17, vmulq_qs8(temp, const_minus_32_over_17, fixed_point_position));
+
+    // Set initial guess to one if x > 1
+    uint8x16_t set_one = vcgtq_s8(x, const_one);
+    x                  = vbslq_s8(set_one, const_one, x);
+
+    // Use three iterations of the Newton-Raphson method to get the result
+    x = vaddq_s8(x, vmulq_qs8(x, vsubq_s8(const_one, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
+    x = vaddq_s8(x, vmulq_qs8(x, vsubq_s8(const_one, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
+    x = vaddq_s8(x, vmulq_qs8(x, vsubq_s8(const_one, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
+
+    return vshlq_s8(x, shift_value);
+}
+
+inline qint8x16_t vqrecipq_qs8(qint8x16_t a, int fixed_point_position)
+{
+    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
+    const qint8x16_t const_48_over_17       = vdupq_n_s8(0x7A >> (5 - fixed_point_position));   // 2.823
+    const qint8x16_t const_minus_32_over_17 = vdupq_n_s8((0x3C >> (5 - fixed_point_position))); // -1.8823
+    const qint8x16_t const_one              = vdupq_n_s8(1 << fixed_point_position);
+
+    // Find shift value
+    const qint8x16_t shift_value = vqnegq_s8(vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
+    const qint8x16_t temp        = vqshlq_s8(a, shift_value);
+
+    qint8x16_t x = vqsubq_qs8(const_48_over_17, vmulq_qs8(temp, const_minus_32_over_17, fixed_point_position));
+
+    // Set initial guess to one if x > 1
+    uint8x16_t set_one = vcgtq_s8(x, const_one);
+    x                  = vbslq_s8(set_one, const_one, x);
+
+    // Use three iterations of the Newton-Raphson method to get the result
+    x = vqaddq_s8(x, vqmulq_qs8(x, vqsubq_s8(const_one, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
+    x = vqaddq_s8(x, vqmulq_qs8(x, vqsubq_s8(const_one, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
+    x = vqaddq_s8(x, vqmulq_qs8(x, vqsubq_s8(const_one, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
+
+    return vqshlq_s8(x, shift_value);
+}
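The reciprocal routines above follow the classic Newton-Raphson scheme: normalise the argument, take 48/17 - 32/17 * x as the initial guess, then refine three times. A float sketch of the same idea (illustrative only, assumes a > 0, helper name invented):

#include <cmath>

static inline float newton_reciprocal(float a) // assumes a > 0
{
    int         exponent = 0;
    const float m        = std::frexp(a, &exponent); // a = m * 2^exponent, with m in [0.5, 1)
    float       x        = 48.0f / 17.0f - (32.0f / 17.0f) * m;
    for (int i = 0; i < 3; ++i)                       // three iterations, as above
    {
        x = x + x * (1.0f - m * x);                   // Newton-Raphson step for 1/m
    }
    return std::ldexp(x, -exponent);                  // undo the normalisation
}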
+
+inline qint8x8_t vdiv_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
+{
+    return vmul_qs8(a, vrecip_qs8(b, fixed_point_position), fixed_point_position);
+}
+
+inline qint8x16_t vdivq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
+{
+    return vmulq_qs8(a, vrecipq_qs8(b, fixed_point_position), fixed_point_position);
+}
+
+template <bool   islog>
+inline qint8x8_t vtaylor_poly_qs8(int8x8_t a, int fixed_point_position)
+{
+    const qint8x8_t shift_value = vdup_n_s8(-(7 - fixed_point_position));
+    const qint8x8_t const_one   = vdup_n_s8(1);
+    const qint8x8_t A           = vrshl_s8(islog ? log_tab_qs8[0] : exp_tab_qs8[0], islog ? vadd_s8(shift_value, const_one) : shift_value);
+    const qint8x8_t B           = vrshl_s8(islog ? log_tab_qs8[1] : exp_tab_qs8[1], shift_value);
+    const qint8x8_t C           = vrshl_s8(islog ? log_tab_qs8[2] : exp_tab_qs8[2], shift_value);
+    const qint8x8_t D           = vrshl_s8(islog ? log_tab_qs8[3] : exp_tab_qs8[3], shift_value);
+    const qint8x8_t x1          = vadd_s8(vmul_qs8(a, D, fixed_point_position), C);
+    const qint8x8_t x2          = vadd_s8(vmul_qs8(a, x1, fixed_point_position), B);
+    const qint8x8_t x3          = vadd_s8(vmul_qs8(a, x2, fixed_point_position), A);
+    const qint8x8_t res         = vmul_qs8(a, x3, fixed_point_position);
+    return res;
+}
+
+template <bool   islog>
+inline qint8x8_t vqtaylor_poly_qs8(int8x8_t a, int fixed_point_position)
+{
+    const qint8x8_t shift_value = vdup_n_s8(-(7 - fixed_point_position));
+    const qint8x8_t const_one   = vdup_n_s8(1);
+    const qint8x8_t A           = vqrshl_s8(islog ? log_tab_qs8[0] : exp_tab_qs8[0], islog ? vqadd_s8(shift_value, const_one) : shift_value);
+    const qint8x8_t B           = vqrshl_s8(islog ? log_tab_qs8[1] : exp_tab_qs8[1], shift_value);
+    const qint8x8_t C           = vqrshl_s8(islog ? log_tab_qs8[2] : exp_tab_qs8[2], shift_value);
+    const qint8x8_t D           = vqrshl_s8(islog ? log_tab_qs8[3] : exp_tab_qs8[3], shift_value);
+    const qint8x8_t x1          = vqadd_s8(vqmul_qs8(a, D, fixed_point_position), C);
+    const qint8x8_t x2          = vqadd_s8(vqmul_qs8(a, x1, fixed_point_position), B);
+    const qint8x8_t x3          = vqadd_s8(vqmul_qs8(a, x2, fixed_point_position), A);
+    const qint8x8_t res         = vqmul_qs8(a, x3, fixed_point_position);
+    return res;
+}
+
+template <bool    islog>
+inline qint8x16_t vtaylor_polyq_qs8(qint8x16_t a, int fixed_point_position)
+{
+    const qint8x16_t shift_value = vdupq_n_s8(-(7 - fixed_point_position));
+    const qint8x16_t const_one   = vdupq_n_s8(1);
+    const qint8x16_t A           = vrshlq_s8(islog ? log_tabq_qs8[0] : exp_tabq_qs8[0], islog ? vaddq_s8(shift_value, const_one) : shift_value);
+    const qint8x16_t B           = vrshlq_s8(islog ? log_tabq_qs8[1] : exp_tabq_qs8[1], shift_value);
+    const qint8x16_t C           = vrshlq_s8(islog ? log_tabq_qs8[2] : exp_tabq_qs8[2], shift_value);
+    const qint8x16_t D           = vrshlq_s8(islog ? log_tabq_qs8[3] : exp_tabq_qs8[3], shift_value);
+    const qint8x16_t x1          = vaddq_s8(vmulq_qs8(a, D, fixed_point_position), C);
+    const qint8x16_t x2          = vaddq_s8(vmulq_qs8(a, x1, fixed_point_position), B);
+    const qint8x16_t x3          = vaddq_s8(vmulq_qs8(a, x2, fixed_point_position), A);
+    const qint8x16_t res         = vmulq_qs8(a, x3, fixed_point_position);
+    return res;
+}
+
+template <bool    islog>
+inline qint8x16_t vqtaylor_polyq_qs8(qint8x16_t a, int fixed_point_position)
+{
+    const qint8x16_t shift_value = vdupq_n_s8(-(7 - fixed_point_position));
+    const qint8x16_t const_one   = vdupq_n_s8(1);
+    const qint8x16_t A           = vqrshlq_s8(islog ? log_tabq_qs8[0] : exp_tabq_qs8[0], islog ? vqaddq_s8(shift_value, const_one) : shift_value);
+    const qint8x16_t B           = vqrshlq_s8(islog ? log_tabq_qs8[1] : exp_tabq_qs8[1], shift_value);
+    const qint8x16_t C           = vqrshlq_s8(islog ? log_tabq_qs8[2] : exp_tabq_qs8[2], shift_value);
+    const qint8x16_t D           = vqrshlq_s8(islog ? log_tabq_qs8[3] : exp_tabq_qs8[3], shift_value);
+    const qint8x16_t x1          = vqaddq_s8(vqmulq_qs8(a, D, fixed_point_position), C);
+    const qint8x16_t x2          = vqaddq_s8(vqmulq_qs8(a, x1, fixed_point_position), B);
+    const qint8x16_t x3          = vqaddq_s8(vqmulq_qs8(a, x2, fixed_point_position), A);
+    const qint8x16_t res         = vqmulq_qs8(a, x3, fixed_point_position);
+    return res;
+}
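All four templates above evaluate the same degree-4 polynomial by Horner's rule; a scalar float equivalent (illustrative only, helper name invented) is:

// res = a * (A + a * (B + a * (C + a * D))), matching the x1/x2/x3 chain above.
static inline float taylor_poly4(float a, float A, float B, float C, float D)
{
    const float x1 = a * D + C;
    const float x2 = a * x1 + B;
    const float x3 = a * x2 + A;
    return a * x3;
}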
+
+inline qint8x8_t vqexp_qs8(qint8x8_t a, int fixed_point_position)
+{
+    const qint8x8_t shift_value   = vdup_n_s8(fixed_point_position - 7);
+    const qint8x8_t const_one     = vdup_n_s8(1 << fixed_point_position);
+    const qint8x8_t const_ln2     = vqrshl_s8(vdup_n_s8(0x58), shift_value);                     // ln(2)
+    const qint8x8_t const_inv_ln2 = vorr_s8(vqrshl_s8(vdup_n_s8(0x38), shift_value), const_one); // 1/ln(2)
+
+    // Perform range reduction [-log(2),log(2)]
+    const qint8x8_t m = vqmul_qs8(a, const_inv_ln2, fixed_point_position); // x / ln(2)
+
+    // get decimal part from m
+    const qint8x8_t dec_m = vqshl_s8(m, vdup_n_s8(-fixed_point_position));
+
+    qint8x8_t alpha = vqmul_qs8(vqshl_s8(dec_m, vdup_n_s8(fixed_point_position)), const_ln2, fixed_point_position);
+    alpha           = vqabs_qs8(vqsub_s8(a, alpha));
+
+    // Polynomial Approximation
+    qint8x8_t poly = vqtaylor_poly_qs8<false>(alpha, fixed_point_position);
+    poly           = vqadd_s8(poly, const_one);
+
+    // Reconstruct
+    poly = vqshl_s8(poly, dec_m);
+
+    return poly;
+}
+
+inline qint8x16_t vqexpq_qs8(qint8x16_t a, int fixed_point_position)
+{
+    const qint8x16_t shift_value   = vdupq_n_s8(fixed_point_position - 7);
+    const qint8x16_t const_one     = vdupq_n_s8(1 << fixed_point_position);
+    const qint8x16_t const_ln2     = vqrshlq_s8(vdupq_n_s8(0x58), shift_value);                      // ln(2)
+    const qint8x16_t const_inv_ln2 = vorrq_s8(vqrshlq_s8(vdupq_n_s8(0x38), shift_value), const_one); // 1/ln(2)
+
+    // Perform range reduction [-log(2),log(2)]
+    const qint8x16_t m = vqmulq_qs8(a, const_inv_ln2, fixed_point_position); // x / ln(2)
+
+    // get decimal part from m
+    const qint8x16_t dec_m = vqshlq_s8(m, vdupq_n_s8(-fixed_point_position));
+
+    qint8x16_t alpha = vqmulq_qs8(vqshlq_s8(dec_m, vdupq_n_s8(fixed_point_position)), const_ln2, fixed_point_position);
+    alpha            = vqabsq_qs8(vqsubq_qs8(a, alpha));
+
+    // Polynomial Approximation
+    qint8x16_t poly = vqtaylor_polyq_qs8<false>(alpha, fixed_point_position);
+    poly            = vqaddq_s8(poly, const_one);
+
+    // Reconstruct
+    poly = vqshlq_s8(poly, dec_m);
+
+    return poly;
+}
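The exponential above relies on the standard range reduction exp(x) = 2^m * exp(x - m * ln2). The float sketch below is illustrative only: it uses std::exp in place of the Q0.7 polynomial and a plain floor where the fixed-point code extracts dec_m by shifting.

#include <cmath>

static inline float exp_by_range_reduction(float x)
{
    const float ln2 = 0.6931471805f;
    const int   m   = static_cast<int>(std::floor(x / ln2)); // integer multiple of ln2
    const float r   = x - static_cast<float>(m) * ln2;       // reduced argument in [0, ln2)
    return std::ldexp(std::exp(r), m);                        // reconstruct: 2^m * exp(r)
}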
+
+inline qint8x8_t vlog_qs8(qint8x8_t a, int fixed_point_position)
+{
+    const qint8x8_t const_one       = vdup_n_s8(1 << fixed_point_position);
+    const qint8x8_t const_seven_dec = vdup_n_s8(7);
+    const qint8x8_t const_ln2       = vdup_n_s8(0x58 >> (7 - fixed_point_position)); // ln(2)
+
+    // If 0 < a < 1, calculate log(1/x)
+    uint8x8_t calc_reciprocal = vclt_s8(a, const_one);
+    qint8x8_t recip           = vdup_n_s8(0);
+    recip                     = vbsl_s8(calc_reciprocal, a, recip);
+
+    // Calculate reciprocal
+    recip = vrecip_qs8(recip, fixed_point_position);
+    a     = vbsl_s8(calc_reciprocal, recip, a);
+
+    // Get decimal part of a
+    qint8x8_t shift_value = vdup_n_s8(-fixed_point_position);
+    qint8x8_t dec_a       = vshl_s8(a, shift_value); // a >> fixed_point_position
+
+    // Get exponent of 2^n which is equal or less than dec_a
+    shift_value = vsub_s8(const_seven_dec, vclz_s8(dec_a));
+
+    // Get x to range (1, 2]
+    const qint8x8_t shift_value_neg = vneg_s8(shift_value);
+    const qint8x8_t temp            = vsub_s8(vrshl_s8(a, shift_value_neg), const_one);
+    const qint8x8_t sum             = vmul_s8(shift_value, const_one);
+
+    // Polynomial Approximation
+    qint8x8_t poly = vtaylor_poly_qs8<true>(temp, fixed_point_position);
+
+    // Reconstruct
+    poly = vmul_qs8(vadd_s8(poly, sum), const_ln2, fixed_point_position);
+
+    // Set negative value for 0 < a < 1
+    poly = vbsl_s8(calc_reciprocal, vneg_s8(poly), poly);
+
+    return poly;
+}
+
+inline qint8x16_t vlogq_qs8(qint8x16_t a, int fixed_point_position)
+{
+    const qint8x16_t const_one       = vdupq_n_s8(1 << fixed_point_position);
+    const qint8x16_t const_seven_dec = vdupq_n_s8(7);
+    const qint8x16_t const_ln2       = vdupq_n_s8(0x58 >> (7 - fixed_point_position)); // ln(2)
+
+    // If 0 < a < 1, calculate log(1/x)
+    uint8x16_t calc_reciprocal = vcltq_s8(a, const_one);
+    qint8x16_t recip           = vdupq_n_s8(0);
+    recip                      = vbslq_s8(calc_reciprocal, a, recip);
+
+    // Calculate reciprocal
+    recip = vrecipq_qs8(recip, fixed_point_position);
+    a     = vbslq_s8(calc_reciprocal, recip, a);
+
+    // Get decimal part of a
+    qint8x16_t shift_value = vdupq_n_s8(-fixed_point_position);
+    qint8x16_t dec_a       = vshlq_s8(a, shift_value); // a >> fixed_point_position
+
+    // Get exponent of 2^n which is equal or less than dec_a
+    shift_value = vsubq_s8(const_seven_dec, vclzq_s8(dec_a));
+
+    // Get x to range (1, 2]
+    const qint8x16_t shift_value_neg = vnegq_s8(shift_value);
+    const qint8x16_t temp            = vsubq_s8(vrshlq_s8(a, shift_value_neg), const_one);
+    const qint8x16_t sum             = vmulq_s8(shift_value, const_one);
+
+    // Polynomial Approximation
+    qint8x16_t poly = vtaylor_polyq_qs8<true>(temp, fixed_point_position);
+
+    // Reconstruct
+    poly = vmulq_qs8(vaddq_s8(poly, sum), const_ln2, fixed_point_position);
+
+    // Set negative value for 0 < a < 1
+    poly = vbslq_s8(calc_reciprocal, vnegq_s8(poly), poly);
+
+    return poly;
+}
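The logarithm above decomposes its argument as a = x * 2^n with x in (1, 2], so that log(a) = n * ln2 + log(x) and only log(x) needs the polynomial. A float sketch of the same decomposition (illustrative only, valid for a > 0, helper name invented):

#include <cmath>

static inline float log_by_decomposition(float a) // assumes a > 0
{
    const float ln2 = 0.6931471805f;
    int         n   = 0;
    float       x   = std::frexp(a, &n); // a = x * 2^n, with x in [0.5, 1)
    x *= 2.0f;                           // move x into [1, 2)
    --n;
    return static_cast<float>(n) * ln2 + std::log(x); // std::log stands in for the polynomial
}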
+
+inline qint8x8_t vinvsqrt_qs8(qint8x8_t a, int fixed_point_position)
+{
+    const qint8x8_t const_three = vdup_n_s8(3 << fixed_point_position);
+
+    // Find shift value. Number must be in (0.5, 2) range.
+    qint8x8_t shift_value = vneg_s8(vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
+
+    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
+    qint8x8_t temp         = vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position)));
+    uint8x8_t temp_ltz     = vclt_s8(temp, vdup_n_qs8(0));
+    temp                   = vbsl_s8(temp_ltz, vadd_s8(temp, vdup_n_s8(1)), temp);
+    qint8x8_t shift_value2 = vneg_s8(vshr_n_s8(temp, 1));
+
+    temp = vshl_s8(a, shift_value);
+
+    // Initial guess
+    qint8x8_t x = temp;
+
+    // Calculate (x / 2) * (3 - a * x^2)
+    // After three iterations we have the result for 8 bit
+    x = vshr_n_s8(vmul_qs8(x, vsub_s8(const_three, vmul_qs8(temp, vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
+    x = vshr_n_s8(vmul_qs8(x, vsub_s8(const_three, vmul_qs8(temp, vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
+    x = vshr_n_s8(vmul_qs8(x, vsub_s8(const_three, vmul_qs8(temp, vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
+
+    return vshl_s8(x, shift_value2);
+}
+
+inline qint8x8_t vqinvsqrt_qs8(qint8x8_t a, int fixed_point_position)
+{
+    const qint8x8_t const_three = vdup_n_s8(3 << fixed_point_position);
+
+    // Find shift value. Number must be in (0.5, 2) range.
+    qint8x8_t shift_value = vneg_s8(vqsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
+
+    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
+    qint8x8_t temp         = vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position)));
+    uint8x8_t temp_ltz     = vclt_s8(temp, vdup_n_qs8(0));
+    temp                   = vbsl_s8(temp_ltz, vadd_s8(temp, vdup_n_s8(1)), temp);
+    qint8x8_t shift_value2 = vneg_s8(vshr_n_s8(temp, 1));
+
+    temp = vshl_s8(a, shift_value);
+
+    // Initial guess
+    qint8x8_t x = temp;
+
+    // Calculate (x / 2) * (3 - a * x^2)
+    // After three iterations we have the result for 8 bit
+    x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
+    x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
+    x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
+
+    return vshl_s8(x, shift_value2);
+}
+
+inline qint8x16_t vinvsqrtq_qs8(qint8x16_t a, int fixed_point_position)
+{
+    const qint8x16_t const_three = vdupq_n_s8(3 << fixed_point_position);
+
+    // Find shift value. Number must be in (0.5, 2) range.
+    qint8x16_t shift_value = vnegq_s8(vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
+
+    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
+    qint8x16_t temp         = vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position)));
+    uint8x16_t temp_ltz     = vcltq_s8(temp, vdupq_n_qs8(0));
+    temp                    = vbslq_s8(temp_ltz, vaddq_s8(temp, vdupq_n_s8(1)), temp);
+    qint8x16_t shift_value2 = vnegq_s8(vshrq_n_s8(temp, 1));
+
+    temp = vshlq_s8(a, shift_value);
+
+    // Initial guess
+    qint8x16_t x = temp;
+
+    // Calculate (x / 2) * (3 - a * x^2)
+    // After three iterations we have the result for 8 bit
+    x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(const_three, vmulq_qs8(temp, vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
+    x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(const_three, vmulq_qs8(temp, vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
+    x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(const_three, vmulq_qs8(temp, vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
+
+    return vshlq_s8(x, shift_value2);
+}
+
+inline qint8x16_t vqinvsqrtq_qs8(qint8x16_t a, int fixed_point_position)
+{
+    const qint8x16_t const_three = vdupq_n_s8(3 << fixed_point_position);
+
+    // Find shift value. Number must be in (0.5, 2) range.
+    qint8x16_t shift_value = vnegq_s8(vqsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
+
+    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
+    qint8x16_t temp         = vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position)));
+    uint8x16_t temp_ltz     = vcltq_s8(temp, vdupq_n_qs8(0));
+    temp                    = vbslq_s8(temp_ltz, vaddq_s8(temp, vdupq_n_s8(1)), temp);
+    qint8x16_t shift_value2 = vnegq_s8(vshrq_n_s8(temp, 1));
+
+    temp = vshlq_s8(a, shift_value);
+
+    // Initial guess
+    qint8x16_t x = temp;
+
+    // Calculate (x / 2) * (3 - a * x^2)
+    // After three iterations we have the result for 8 bit
+    x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
+    x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
+    x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
+
+    return vshlq_s8(x, shift_value2);
+}
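Each Newton-Raphson step in the inverse square root routines above computes x <- x * (3 - a * x * x) / 2; in float the three iterations are simply (illustrative sketch, helper name invented):

// Refines an initial estimate x0 of 1 / sqrt(a).
static inline float newton_inv_sqrt(float a, float x0)
{
    float x = x0;
    for (int i = 0; i < 3; ++i) // three iterations, as in vinvsqrt_qs8 above
    {
        x = x * (3.0f - a * x * x) * 0.5f;
    }
    return x;
}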
+
+inline qint8x8_t vtanh_qs8(qint8x8_t a, int fixed_point_position)
+{
+    const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);
+    const qint8x8_t const_two = vdup_n_s8(2 << fixed_point_position);
+
+    qint8x8_t exp2x = vqexp_qs8(vqmul_qs8(const_two, a, fixed_point_position), fixed_point_position);
+    qint8x8_t num   = vqsub_qs8(exp2x, const_one);
+    qint8x8_t den   = vqadd_qs8(exp2x, const_one);
+    qint8x8_t tanh  = vqmul_qs8(num, vrecip_qs8(den, fixed_point_position), fixed_point_position);
+
+    return tanh;
+}
+
+inline qint8x16_t vtanhq_qs8(qint8x16_t a, int fixed_point_position)
+{
+    const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position);
+    const qint8x16_t const_two = vdupq_n_s8(2 << fixed_point_position);
+
+    qint8x16_t exp2x = vqexpq_qs8(vqmulq_qs8(const_two, a, fixed_point_position), fixed_point_position);
+    qint8x16_t num   = vqsubq_qs8(exp2x, const_one);
+    qint8x16_t den   = vqaddq_qs8(exp2x, const_one);
+    qint8x16_t tanh  = vqmulq_qs8(num, vqrecipq_qs8(den, fixed_point_position), fixed_point_position);
+
+    return tanh;
+}
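tanh is obtained from the identity tanh(x) = (exp(2x) - 1) / (exp(2x) + 1), which is exactly what the saturating exp, sub, add and reciprocal calls above compute per lane. A float sketch (illustrative only, helper name invented):

#include <cmath>

static inline float tanh_via_exp(float x)
{
    const float e2x = std::exp(2.0f * x);
    return (e2x - 1.0f) / (e2x + 1.0f);
}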
+
+inline qint8x16_t vqpowq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
+{
+    return vqexpq_qs8(vqmulq_qs8(b, vlogq_qs8(a, fixed_point_position), fixed_point_position), fixed_point_position);
+}
+} // namespace arm_compute
diff --git a/arm_compute/core/NEON/NEKernels.h b/arm_compute/core/NEON/NEKernels.h
index 55f54dd..eaa50f1 100644
--- a/arm_compute/core/NEON/NEKernels.h
+++ b/arm_compute/core/NEON/NEKernels.h
@@ -30,6 +30,7 @@
 #include "arm_compute/core/NEON/kernels/NEActivationLayerKernel.h"
 #include "arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h"
 #include "arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h"
+#include "arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h"
 #include "arm_compute/core/NEON/kernels/NEBitwiseAndKernel.h"
 #include "arm_compute/core/NEON/kernels/NEBitwiseNotKernel.h"
 #include "arm_compute/core/NEON/kernels/NEBitwiseOrKernel.h"
@@ -41,11 +42,13 @@
 #include "arm_compute/core/NEON/kernels/NECol2ImKernel.h"
 #include "arm_compute/core/NEON/kernels/NEColorConvertKernel.h"
 #include "arm_compute/core/NEON/kernels/NEConvolutionKernel.h"
-#include "arm_compute/core/NEON/kernels/NEConvolutionLayerWeightsReshapeKernel.h"
 #include "arm_compute/core/NEON/kernels/NECumulativeDistributionKernel.h"
+#include "arm_compute/core/NEON/kernels/NEDepthConcatenateKernel.h"
 #include "arm_compute/core/NEON/kernels/NEDepthConvertKernel.h"
 #include "arm_compute/core/NEON/kernels/NEDerivativeKernel.h"
 #include "arm_compute/core/NEON/kernels/NEDilateKernel.h"
+#include "arm_compute/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.h"
+#include "arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h"
 #include "arm_compute/core/NEON/kernels/NEErodeKernel.h"
 #include "arm_compute/core/NEON/kernels/NEFastCornersKernel.h"
 #include "arm_compute/core/NEON/kernels/NEFillArrayKernel.h"
@@ -62,12 +65,12 @@
 #include "arm_compute/core/NEON/kernels/NEGaussianPyramidKernel.h"
 #include "arm_compute/core/NEON/kernels/NEHOGDescriptorKernel.h"
 #include "arm_compute/core/NEON/kernels/NEHOGDetectorKernel.h"
-#include "arm_compute/core/NEON/kernels/NEHOGNonMaximaSuppressionKernel.h"
 #include "arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h"
 #include "arm_compute/core/NEON/kernels/NEHistogramKernel.h"
 #include "arm_compute/core/NEON/kernels/NEIm2ColKernel.h"
 #include "arm_compute/core/NEON/kernels/NEIntegralImageKernel.h"
 #include "arm_compute/core/NEON/kernels/NELKTrackerKernel.h"
+#include "arm_compute/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.h"
 #include "arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h"
 #include "arm_compute/core/NEON/kernels/NEMeanStdDevKernel.h"
 #include "arm_compute/core/NEON/kernels/NEMedian3x3Kernel.h"
@@ -88,5 +91,6 @@
 #include "arm_compute/core/NEON/kernels/NEThresholdKernel.h"
 #include "arm_compute/core/NEON/kernels/NETransposeKernel.h"
 #include "arm_compute/core/NEON/kernels/NEWarpKernel.h"
+#include "arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h"
 
 #endif /* __ARM_COMPUTE_NEKERNELS_H__ */
diff --git a/arm_compute/core/NEON/NEMath.h b/arm_compute/core/NEON/NEMath.h
index d40e0c3..bb8a330 100644
--- a/arm_compute/core/NEON/NEMath.h
+++ b/arm_compute/core/NEON/NEMath.h
@@ -28,131 +28,46 @@
 
 namespace arm_compute
 {
-/* Exponent polynomial coefficients */
-const std::array<float32x4_t, 8> exp_tab =
-{
-    {
-        vdupq_n_f32(1.f),
-        vdupq_n_f32(0.0416598916054f),
-        vdupq_n_f32(0.500000596046f),
-        vdupq_n_f32(0.0014122662833f),
-        vdupq_n_f32(1.00000011921f),
-        vdupq_n_f32(0.00833693705499f),
-        vdupq_n_f32(0.166665703058f),
-        vdupq_n_f32(0.000195780929062f),
-    }
-};
-
-/* Logarithm polynomial coefficients */
-const std::array<float32x4_t, 8> log_tab =
-{
-    {
-        vdupq_n_f32(-2.29561495781f),
-        vdupq_n_f32(-2.47071170807f),
-        vdupq_n_f32(-5.68692588806f),
-        vdupq_n_f32(-0.165253549814f),
-        vdupq_n_f32(5.17591238022f),
-        vdupq_n_f32(0.844007015228f),
-        vdupq_n_f32(4.58445882797f),
-        vdupq_n_f32(0.0141278216615f),
-    }
-};
-
 /** Calculate inverse square root.
  *
- * @param x Input value.
+ * @param[in] x Input value.
  *
  * @return The calculated inverse square root.
  */
-inline float32x4_t vinvsqrtq_f32(float32x4_t x)
-{
-    float32x4_t sqrt_reciprocal = vrsqrteq_f32(x);
-    sqrt_reciprocal             = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
-    sqrt_reciprocal             = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
-
-    return sqrt_reciprocal;
-}
+float32x4_t vinvsqrtq_f32(float32x4_t x);
 
 /** Calculate reciprocal.
  *
- * @param x Input value.
+ * @param[in] x Input value.
  *
  * @return The calculated reciprocal.
  */
-inline float32x4_t vinvq_f32(const float32x4_t &x)
-{
-    float32x4_t recip = vrecpeq_f32(x);
-    recip             = vmulq_f32(vrecpsq_f32(x, recip), recip);
-    recip             = vmulq_f32(vrecpsq_f32(x, recip), recip);
-    return recip;
-}
+float32x4_t vinvq_f32(float32x4_t x);
 
 /** Perform a 7th degree polynomial approximation using Estrin's method.
  *
- * @param x       Input vector value in F32 format.
- * @param coeffs  Polynomial coefficients table.
+ * @param[in] x      Input vector value in F32 format.
+ * @param[in] coeffs Polynomial coefficients table.
  *
  * @return The calculated approximation.
  */
-inline float32x4_t vtaylor_polyq_f32(const float32x4_t &x, const std::array<float32x4_t, 8> &coeffs)
-{
-    float32x4_t A   = vmlaq_f32(coeffs[0], coeffs[4], x);
-    float32x4_t B   = vmlaq_f32(coeffs[2], coeffs[6], x);
-    float32x4_t C   = vmlaq_f32(coeffs[1], coeffs[5], x);
-    float32x4_t D   = vmlaq_f32(coeffs[3], coeffs[7], x);
-    float32x4_t x2  = vmulq_f32(x, x);
-    float32x4_t x4  = vmulq_f32(x2, x2);
-    float32x4_t res = vmlaq_f32(vmlaq_f32(A, B, x2), vmlaq_f32(C, D, x2), x4);
-    return res;
-}
+float32x4_t vtaylor_polyq_f32(float32x4_t x, const std::array<float32x4_t, 8> &coeffs);
 
 /** Calculate exponential
  *
- * @param x Input vector value in F32 format.
+ * @param[in] x Input vector value in F32 format.
  *
  * @return The calculated exponent.
  */
-inline float32x4_t vexpq_f32(const float32x4_t &x)
-{
-    static const float32x4_t CONST_LN2     = vdupq_n_f32(0.6931471805f); // ln(2)
-    static const float32x4_t CONST_INV_LN2 = vdupq_n_f32(1.4426950408f); // 1/ln(2)
-
-    // Perform range reduction [-log(2),log(2)]
-    int32x4_t   m   = vcvtq_s32_f32(vmulq_f32(x, CONST_INV_LN2));
-    float32x4_t val = vmlsq_f32(x, vcvtq_f32_s32(m), CONST_LN2);
-
-    // Polynomial Approximation
-    float32x4_t poly = vtaylor_polyq_f32(val, exp_tab);
-
-    // Reconstruct
-    poly = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(poly), vshlq_n_s32(m, 23)));
-
-    return poly;
-}
+float32x4_t vexpq_f32(float32x4_t x);
 
 /** Calculate logarithm
  *
- * @param x Input vector value in F32 format.
+ * @param[in] x Input vector value in F32 format.
  *
  * @return The calculated logarithm.
  */
-inline float32x4_t vlogq_f32(const float32x4_t &x)
-{
-    static const int32x4_t   CONST_127 = vdupq_n_s32(127);           // 127
-    static const float32x4_t CONST_LN2 = vdupq_n_f32(0.6931471805f); // ln(2)
-
-    // Extract exponent
-    int32x4_t   m   = vsubq_s32(vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_f32(x), 23)), CONST_127);
-    float32x4_t val = vreinterpretq_f32_s32(vsubq_s32(vreinterpretq_s32_f32(x), vshlq_n_s32(m, 23)));
-
-    // Polynomial Approximation
-    float32x4_t poly = vtaylor_polyq_f32(val, log_tab);
-
-    // Reconstruct
-    poly = vmlaq_f32(poly, vcvtq_f32_s32(m), CONST_LN2);
-
-    return poly;
-}
+float32x4_t vlogq_f32(float32x4_t x);
 
 /** Calculate hyperbolic tangent.
  *
@@ -160,38 +75,22 @@
  *
  * @note We clamp x to [-5,5] to avoid overflowing issues.
  *
- * @param val Input vector value in F32 format.
+ * @param[in] val Input vector value in F32 format.
  *
  * @return The calculated Hyperbolic Tangent.
  */
-inline float32x4_t vtanhq_f32(const float32x4_t &val)
-{
-    static const float32x4_t CONST_1        = vdupq_n_f32(1.f);  // 1.f
-    static const float32x4_t CONST_2        = vdupq_n_f32(2.f);  // 2.f
-    static const float32x4_t CONST_MIN_TANH = vdupq_n_f32(-5.f); // -5.f
-    static const float32x4_t CONST_MAX_TANH = vdupq_n_f32(5.f);  // 5.f
-
-    float32x4_t x     = vminq_f32(vmaxq_f32(val, CONST_MIN_TANH), CONST_MAX_TANH);
-    float32x4_t exp2x = vexpq_f32(vmulq_f32(CONST_2, x));
-    float32x4_t num   = vsubq_f32(exp2x, CONST_1);
-    float32x4_t den   = vaddq_f32(exp2x, CONST_1);
-    float32x4_t tanh  = vmulq_f32(num, vinvq_f32(den));
-    return tanh;
-}
+float32x4_t vtanhq_f32(float32x4_t val);
 
 /** Calculate n power of a number.
  *
  * pow(x,n) = e^(n*log(x))
  *
- * @param val Input vector value in F32 format.
- * @param n   Powers to raise the input to.
+ * @param[in] val Input vector value in F32 format.
+ * @param[in] n   Powers to raise the input to.
  *
  * @return The calculated power.
  */
-inline float32x4_t vpowq_f32(const float32x4_t &val, const float32x4_t &n)
-{
-    return vexpq_f32(vmulq_f32(n, vlogq_f32(val)));
+float32x4_t vpowq_f32(float32x4_t val, float32x4_t n);
 }
-}
-
+#include "arm_compute/core/NEON/NEMath.inl"
 #endif /* __ARM_COMPUTE_NEMATH_H__ */
diff --git a/arm_compute/core/NEON/NEMath.inl b/arm_compute/core/NEON/NEMath.inl
new file mode 100644
index 0000000..a31a4c0
--- /dev/null
+++ b/arm_compute/core/NEON/NEMath.inl
@@ -0,0 +1,141 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+namespace arm_compute
+{
+/* Exponent polynomial coefficients */
+const std::array<float32x4_t, 8> exp_tab =
+{
+    {
+        vdupq_n_f32(1.f),
+        vdupq_n_f32(0.0416598916054f),
+        vdupq_n_f32(0.500000596046f),
+        vdupq_n_f32(0.0014122662833f),
+        vdupq_n_f32(1.00000011921f),
+        vdupq_n_f32(0.00833693705499f),
+        vdupq_n_f32(0.166665703058f),
+        vdupq_n_f32(0.000195780929062f),
+    }
+};
+
+/* Logarithm polynomial coefficients */
+const std::array<float32x4_t, 8> log_tab =
+{
+    {
+        vdupq_n_f32(-2.29561495781f),
+        vdupq_n_f32(-2.47071170807f),
+        vdupq_n_f32(-5.68692588806f),
+        vdupq_n_f32(-0.165253549814f),
+        vdupq_n_f32(5.17591238022f),
+        vdupq_n_f32(0.844007015228f),
+        vdupq_n_f32(4.58445882797f),
+        vdupq_n_f32(0.0141278216615f),
+    }
+};
+
+inline float32x4_t vinvsqrtq_f32(float32x4_t x)
+{
+    float32x4_t sqrt_reciprocal = vrsqrteq_f32(x);
+    sqrt_reciprocal             = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
+    sqrt_reciprocal             = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
+
+    return sqrt_reciprocal;
+}
+
+inline float32x4_t vinvq_f32(float32x4_t x)
+{
+    float32x4_t recip = vrecpeq_f32(x);
+    recip             = vmulq_f32(vrecpsq_f32(x, recip), recip);
+    recip             = vmulq_f32(vrecpsq_f32(x, recip), recip);
+    return recip;
+}
+
+inline float32x4_t vtaylor_polyq_f32(float32x4_t x, const std::array<float32x4_t, 8> &coeffs)
+{
+    float32x4_t A   = vmlaq_f32(coeffs[0], coeffs[4], x);
+    float32x4_t B   = vmlaq_f32(coeffs[2], coeffs[6], x);
+    float32x4_t C   = vmlaq_f32(coeffs[1], coeffs[5], x);
+    float32x4_t D   = vmlaq_f32(coeffs[3], coeffs[7], x);
+    float32x4_t x2  = vmulq_f32(x, x);
+    float32x4_t x4  = vmulq_f32(x2, x2);
+    float32x4_t res = vmlaq_f32(vmlaq_f32(A, B, x2), vmlaq_f32(C, D, x2), x4);
+    return res;
+}
+
+inline float32x4_t vexpq_f32(float32x4_t x)
+{
+    static const float32x4_t CONST_LN2     = vdupq_n_f32(0.6931471805f); // ln(2)
+    static const float32x4_t CONST_INV_LN2 = vdupq_n_f32(1.4426950408f); // 1/ln(2)
+
+    // Perform range reduction [-log(2),log(2)]
+    int32x4_t   m   = vcvtq_s32_f32(vmulq_f32(x, CONST_INV_LN2));
+    float32x4_t val = vmlsq_f32(x, vcvtq_f32_s32(m), CONST_LN2);
+
+    // Polynomial Approximation
+    float32x4_t poly = vtaylor_polyq_f32(val, exp_tab);
+
+    // Reconstruct
+    poly = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(poly), vshlq_n_s32(m, 23)));
+
+    return poly;
+}
+
+inline float32x4_t vlogq_f32(float32x4_t x)
+{
+    static const int32x4_t   CONST_127 = vdupq_n_s32(127);           // 127
+    static const float32x4_t CONST_LN2 = vdupq_n_f32(0.6931471805f); // ln(2)
+
+    // Extract exponent
+    int32x4_t   m   = vsubq_s32(vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_f32(x), 23)), CONST_127);
+    float32x4_t val = vreinterpretq_f32_s32(vsubq_s32(vreinterpretq_s32_f32(x), vshlq_n_s32(m, 23)));
+
+    // Polynomial Approximation
+    float32x4_t poly = vtaylor_polyq_f32(val, log_tab);
+
+    // Reconstruct
+    poly = vmlaq_f32(poly, vcvtq_f32_s32(m), CONST_LN2);
+
+    return poly;
+}
+
+inline float32x4_t vtanhq_f32(float32x4_t val)
+{
+    static const float32x4_t CONST_1        = vdupq_n_f32(1.f);
+    static const float32x4_t CONST_2        = vdupq_n_f32(2.f);
+    static const float32x4_t CONST_MIN_TANH = vdupq_n_f32(-10.f);
+    static const float32x4_t CONST_MAX_TANH = vdupq_n_f32(10.f);
+
+    float32x4_t x     = vminq_f32(vmaxq_f32(val, CONST_MIN_TANH), CONST_MAX_TANH);
+    float32x4_t exp2x = vexpq_f32(vmulq_f32(CONST_2, x));
+    float32x4_t num   = vsubq_f32(exp2x, CONST_1);
+    float32x4_t den   = vaddq_f32(exp2x, CONST_1);
+    float32x4_t tanh  = vmulq_f32(num, vinvq_f32(den));
+    return tanh;
+}
+
+inline float32x4_t vpowq_f32(float32x4_t val, float32x4_t n)
+{
+    return vexpq_f32(vmulq_f32(n, vlogq_f32(val)));
+}
+}
\ No newline at end of file
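The helpers above are free functions on NEON vector types, so they can be exercised directly once the library headers are on the include path. A minimal, illustrative sketch (assuming a NEON-enabled build; the buffer contents and the main() wrapper are arbitrary):

#include "arm_compute/core/NEON/NEMath.h"

#include <arm_neon.h>
#include <cstdio>

int main()
{
    // Evaluate the polynomial-based exp approximation four lanes at a time
    float in[8]  = { 0.f, 0.5f, 1.f, 2.f, -1.f, -2.f, 3.f, 4.f };
    float out[8] = { 0.f };

    for(int i = 0; i < 8; i += 4)
    {
        const float32x4_t v = vld1q_f32(in + i);       // load 4 lanes
        vst1q_f32(out + i, arm_compute::vexpq_f32(v)); // store approximate exp(v)
    }

    for(float v : out)
    {
        std::printf("%f\n", v);
    }
    return 0;
}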
diff --git a/arm_compute/core/NEON/kernels/NEAbsoluteDifferenceKernel.h b/arm_compute/core/NEON/kernels/NEAbsoluteDifferenceKernel.h
index 39f92e3..9ef93ce 100644
--- a/arm_compute/core/NEON/kernels/NEAbsoluteDifferenceKernel.h
+++ b/arm_compute/core/NEON/kernels/NEAbsoluteDifferenceKernel.h
@@ -65,8 +65,8 @@
 private:
     /** Common signature for all the specialised absolute difference functions
      *
-     * @param[in]  input1 An input tensor. Data types supported: U8, S16.
-     * @param[in]  input2 An input tensor. Data types supported: U8, S16.
+     * @param[in]  input1 An input tensor. Data types supported: U8/S16.
+     * @param[in]  input2 An input tensor. Data types supported: U8/S16.
      * @param[out] output The output tensor, Data types supported: U8 (Only if both inputs are U8), S16.
      * @param[in]  window Region on which to execute the kernel.
      */
diff --git a/arm_compute/core/NEON/kernels/NEActivationLayerKernel.h b/arm_compute/core/NEON/kernels/NEActivationLayerKernel.h
index ba93c59..97f92d6 100644
--- a/arm_compute/core/NEON/kernels/NEActivationLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEActivationLayerKernel.h
@@ -24,6 +24,7 @@
 #ifndef __ARM_COMPUTE_NEACTIVATIONLAYERKERNEL_H__
 #define __ARM_COMPUTE_NEACTIVATIONLAYERKERNEL_H__
 
+#include "arm_compute/core/FixedPoint.h"
 #include "arm_compute/core/NEON/INESimpleKernel.h"
 
 namespace arm_compute
@@ -46,7 +47,7 @@
     NEActivationLayerKernel &operator=(NEActivationLayerKernel &&) = default;
     /** Set the input and output tensor.
      *
-     * @param[in]  input           Source tensor. Data types supported: F32.
+     * @param[in]  input           Source tensor. Data types supported: QS8/F32.
      * @param[out] output          Destination tensor. Data type supported: same as @p input
      * @param[in]  activation_info Activation layer information.
      */
@@ -66,8 +67,14 @@
      *
      *  @param[in] window Region on which to execute the kernel
      */
-    template <ActivationLayerInfo::ActivationFunction F>
-    void activation(const Window &window);
+    template <ActivationLayerInfo::ActivationFunction F, typename T>
+    typename std::enable_if<std::is_same<T, float>::value, void>::type activation(const Window &window);
+    /** Function to apply an activation function on a tensor.
+     *
+     *  @param[in] window Region on which to execute the kernel
+     */
+    template <ActivationLayerInfo::ActivationFunction F, typename T>
+    typename std::enable_if<std::is_same<T, qint8_t>::value, void>::type activation(const Window &window);
 
 private:
     ActivationFunctionExecutorPtr _func;
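The kernel above now dispatches on both the activation function and the element type (float or qint8_t). As a scalar, illustrative reference of what a few common activations compute per element under their usual definitions (function names below are not library API):

#include <algorithm>
#include <cmath>

// Scalar reference of common activation functions; the NEON kernel applies
// the same math lane-wise on vector (or QS8 fixed-point) registers.
inline float logistic_ref(float x)               { return 1.f / (1.f + std::exp(-x)); }
inline float relu_ref(float x)                   { return std::max(0.f, x); }
inline float tanh_ref(float a, float b, float x) { return a * std::tanh(b * x); }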
diff --git a/arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h b/arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h
index 46d2292..b36ca46 100644
--- a/arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h
+++ b/arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h
@@ -50,9 +50,9 @@
 
     /** Initialise the kernel's input, output and border mode.
      *
-     * @param[in]  input1 An input tensor. Data types supported: U8, S16, F32
-     * @param[in]  input2 An input tensor. Data types supported: U8, S16, F32 (only if @p input1 is F32).
-     * @param[out] output The output tensor. Data types supported: U8 (Only if both inputs are U8), S16, F32 (only if both inputs are F32).
+     * @param[in]  input1 An input tensor. Data types supported: U8/S16/F32
+     * @param[in]  input2 An input tensor. Data types supported: U8/S16/F32 (only if @p input1 is F32).
+     * @param[out] output The output tensor. Data types supported: U8 (Only if both inputs are U8), S16/F32 (only if both inputs are F32).
      * @param[in]  policy Overflow policy.
      */
     void configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy);
@@ -63,9 +63,9 @@
 private:
     /** Common signature for all the specialised add functions
      *
-     * @param[in]  input1 An input tensor. Data types supported: U8, S16, F32.
-     * @param[in]  input2 An input tensor. Data types supported: U8, S16, F32 (only if @p input1 is F32).
-     * @param[out] output The output tensor. Data types supported: U8 (Only if both inputs are U8), S16, F32 (only if both inputs are F32).
+     * @param[in]  input1 An input tensor. Data types supported: U8/S16/F32.
+     * @param[in]  input2 An input tensor. Data types supported: U8/S16/F32 (only if @p input1 is F32).
+     * @param[out] output The output tensor. Data types supported: U8 (Only if both inputs are U8), S16/F32 (only if both inputs are F32).
      * @param[in]  window Region on which to execute the kernel.
      */
     using AddFunction = void(const ITensor *input1, const ITensor *input2, ITensor *output, const Window &window);
diff --git a/arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h b/arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h
index a1dcb73..0eb9c23 100644
--- a/arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h
+++ b/arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h
@@ -50,9 +50,9 @@
 
     /** Initialise the kernel's input, output and border mode.
      *
-     * @param[in]  input1 An input tensor. Data types supported: U8, S16, F32
-     * @param[in]  input2 An input tensor. Data types supported: U8, S16, F32  (only if @p input1 is F32).
-     * @param[out] output The output tensor. Data types supported: U8 (Only if both inputs are U8), S16, F32 (only if both inputs are F32).
+     * @param[in]  input1 An input tensor. Data types supported: U8/S16/F32
+     * @param[in]  input2 An input tensor. Data types supported: U8/S16/F32  (only if @p input1 is F32).
+     * @param[out] output The output tensor. Data types supported: U8 (Only if both inputs are U8), S16/F32 (only if both inputs are F32).
      * @param[in]  policy Overflow policy.
      */
     void configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy);
diff --git a/arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h b/arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h
new file mode 100644
index 0000000..29fcbd2
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEBATCHNORMALIZATIONLAYERKERNEL_H__
+#define __ARM_COMPUTE_NEBATCHNORMALIZATIONLAYERKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the batch normalization layer kernel.
+ */
+class NEBatchNormalizationLayerKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NEBatchNormalizationLayerKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEBatchNormalizationLayerKernel(const NEBatchNormalizationLayerKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEBatchNormalizationLayerKernel &operator=(const NEBatchNormalizationLayerKernel &) = delete;
+    /** Default Move Constructor. */
+    NEBatchNormalizationLayerKernel(NEBatchNormalizationLayerKernel &&) = default;
+    /** Default move assignment operator. */
+    NEBatchNormalizationLayerKernel &operator=(NEBatchNormalizationLayerKernel &&) = default;
+    /** Default destructor */
+    ~NEBatchNormalizationLayerKernel() = default;
+    /** Set the input and output tensors.
+     *
+     * @param[in]  input   Source tensor. 3 lower dimensions represent a single input with dimensions [width, height, FM].
+     *                     The rest are optional and used for representing batches. Data types supported: QS8/F32.
+     * @param[out] output  Destination tensor. Output will have the same number of dimensions as input. Data type supported: same as @p input
+     * @param[in]  mean    Mean values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+     * @param[in]  var     Variance values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+     * @param[in]  beta    Beta values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+     * @param[in]  gamma   Gamma values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+     * @param[in]  epsilon Small value to avoid division with zero.
+     */
+    void configure(const ITensor *input, ITensor *output, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+private:
+    using BatchNormFunction = void(const ITensor *input, ITensor *output, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon, const Window &window);
+    BatchNormFunction *_func;
+    const ITensor     *_input;
+    ITensor           *_output;
+    const ITensor     *_mean;
+    const ITensor     *_var;
+    const ITensor     *_gamma;
+    const ITensor     *_beta;
+    float              _epsilon;
+};
+}
+#endif /*__ARM_COMPUTE_NEBATCHNORMALIZATIONLAYERKERNEL_H__ */
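As a scalar, illustrative reference of the transform the kernel applies per element of feature map c, under the standard batch-normalisation definition (not the library's vectorised code; the helper name is not library API):

#include <cmath>

// out = gamma[c] * (in - mean[c]) / sqrt(var[c] + epsilon) + beta[c]
inline float batch_norm_ref(float in, float mean, float var,
                            float gamma, float beta, float epsilon)
{
    return gamma * (in - mean) / std::sqrt(var + epsilon) + beta;
}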
diff --git a/arm_compute/core/NEON/kernels/NECol2ImKernel.h b/arm_compute/core/NEON/kernels/NECol2ImKernel.h
index b808dc1..f6bc215 100644
--- a/arm_compute/core/NEON/kernels/NECol2ImKernel.h
+++ b/arm_compute/core/NEON/kernels/NECol2ImKernel.h
@@ -66,7 +66,7 @@
 
     /** Set the input and output of the kernel.
      *
-     * @param[in]  input          The input tensor to convert. Data types supported: F32
+     * @param[in]  input          The input tensor to convert. Data types supported: U8/S8/QS8/U16/S16/QS16/F16/U32/S32/F32
      * @param[out] output         The output tensor. 3 lower dimensions represent a single output [width, height, OFM],
      *                            while the rest represent batch of outputs. Data types supported: Same as @p input
      * @param[in]  convolved_dims Output convolved dimensions.
@@ -77,8 +77,22 @@
     void run(const Window &window) override;
 
 private:
-    const ITensor *_input;
-    ITensor       *_output;
+    /** Template function to run the col2im
+     *
+     * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
+     */
+    template <typename T>
+    void run_col2im(const Window &window);
+
+    /** Common signature for all the specialised col2im functions
+     *
+     * @param[in] window Region on which to execute the kernel.
+     */
+    using Col2ImFunctionPtr = void (NECol2ImKernel::*)(const Window &window);
+
+    Col2ImFunctionPtr _func;
+    const ITensor    *_input;
+    ITensor          *_output;
     std::pair<unsigned int, unsigned int> _convolved_dims;
 };
 }
diff --git a/arm_compute/core/NEON/kernels/NEDepthConcatenateKernel.h b/arm_compute/core/NEON/kernels/NEDepthConcatenateKernel.h
new file mode 100644
index 0000000..7384cd1
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEDepthConcatenateKernel.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __ARM_COMPUTE_NEDEPTHCONCATENATEKERNEL_H__
+#define __ARM_COMPUTE_NEDEPTHCONCATENATEKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the depth concatenate kernel.
+ *  The input tensor will be concatenated into the output tensor.
+ */
+class NEDepthConcatenateKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NEDepthConcatenateKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEDepthConcatenateKernel(const NEDepthConcatenateKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEDepthConcatenateKernel &operator=(const NEDepthConcatenateKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEDepthConcatenateKernel(NEDepthConcatenateKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEDepthConcatenateKernel &operator=(NEDepthConcatenateKernel &&) = default;
+    /** Default destructor */
+    ~NEDepthConcatenateKernel() = default;
+    /** Initialise the kernel's inputs and output
+     *
+     * @param[in]     input        Input tensor. Data types supported: F32.
+     * @param[in]     depth_offset The offset on the Z axis.
+     * @param[in,out] output       Output tensor. Data types supported: F32.
+     *
+     * @note The output tensor's two lowest dimensions can't be smaller than the input's.
+     * @note The difference between each of the two lowest dimensions of the input and output tensors must be divisible by 2.
+     *
+     */
+    void configure(const ITensor *input, unsigned int depth_offset, ITensor *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+    BorderSize border_size() const override;
+
+private:
+    const ITensor *_input;
+    ITensor       *_output;
+    int            _top_bottom;
+    int            _left_right;
+    unsigned int   _depth_offset;
+};
+}
+#endif /* __ARM_COMPUTE_NEDEPTHCONCATENATEKERNEL_H__ */
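The notes above imply that the input XY plane is centred inside a possibly larger output plane and written starting at depth_offset along Z. A scalar, illustrative sketch of that copy, assuming tightly packed float buffers (not the kernel's actual implementation):

// Copy one input volume into the output volume for depth concatenation.
void depth_concat_ref(const float *in, int in_w, int in_h, int in_d,
                      float *out, int out_w, int out_h, unsigned int depth_offset)
{
    const int off_x = (out_w - in_w) / 2; // centring offsets, hence the
    const int off_y = (out_h - in_h) / 2; // "divisible by 2" requirement
    for(int z = 0; z < in_d; ++z)
    {
        for(int y = 0; y < in_h; ++y)
        {
            for(int x = 0; x < in_w; ++x)
            {
                out[((depth_offset + z) * out_h + (off_y + y)) * out_w + (off_x + x)] =
                    in[(z * in_h + y) * in_w + x];
            }
        }
    }
}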
diff --git a/arm_compute/core/NEON/kernels/NEDepthConvertKernel.h b/arm_compute/core/NEON/kernels/NEDepthConvertKernel.h
index e92e09b..0c5c29e 100644
--- a/arm_compute/core/NEON/kernels/NEDepthConvertKernel.h
+++ b/arm_compute/core/NEON/kernels/NEDepthConvertKernel.h
@@ -43,15 +43,15 @@
      *
      * Valid conversions Input -> Output :
      *
-     *   - U8 -> U16, S16, U32, S32
-     *   - U16 -> U8, U32, S32
-     *   - S16 -> U8, U32, S32
-     *   - U32 -> U8, U16, S16
-     *   - S32 -> U8, U16, S16
+     *   - QS8 -> F32
+     *   - U8 -> U16, S16, S32
+     *   - U16 -> U8, U32
+     *   - S16 -> U8, S32
+     *   - F32 -> QS8
      *
      *
-     * @param[in]  input  The input tensor to convert. Data types supported: U8, U16, S16, U32 or S32.
-     * @param[out] output The output tensor. Data types supported: U8, U16, S16, U32 or S32.
+     * @param[in]  input  The input tensor to convert. Data types supported: U8/QS8/U16/S16/F32.
+     * @param[out] output The output tensor. Data types supported: U8/QS8/U16/S16/U32/S32/F32.
      * @param[in]  policy Conversion policy.
      * @param[in]  shift  Value for down/up conversions. Must be 0 <= shift < 8.
      */
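A scalar, illustrative sketch of what an up-conversion and a saturating down-conversion with a shift look like (the kernel does this lane-wise with NEON; function names are not library API):

#include <cstdint>

// U8 -> S16 up-conversion: widen, then shift left by `shift` (0 <= shift < 8).
inline int16_t convert_u8_to_s16_ref(uint8_t v, int shift)
{
    return static_cast<int16_t>(static_cast<int16_t>(v) << shift);
}

// S16 -> U8 down-conversion with SATURATE policy: shift right, then clamp.
inline uint8_t convert_s16_to_u8_sat_ref(int16_t v, int shift)
{
    const int32_t shifted = v >> shift;
    return static_cast<uint8_t>(shifted < 0 ? 0 : (shifted > 255 ? 255 : shifted));
}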
diff --git a/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.h b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.h
new file mode 100644
index 0000000..f098e18
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYERBIASACCUMULATEKERNEL_H__
+#define __ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYERBIASACCUMULATEKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+/** NEON kernel to accumulate the biases to each element of the input tensor
+ *
+ * @note We assume bias to be shared
+ */
+class NEDirectConvolutionLayerBiasAccumulateKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NEDirectConvolutionLayerBiasAccumulateKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEDirectConvolutionLayerBiasAccumulateKernel(const NEDirectConvolutionLayerBiasAccumulateKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEDirectConvolutionLayerBiasAccumulateKernel &operator=(const NEDirectConvolutionLayerBiasAccumulateKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEDirectConvolutionLayerBiasAccumulateKernel(NEDirectConvolutionLayerBiasAccumulateKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEDirectConvolutionLayerBiasAccumulateKernel &operator=(NEDirectConvolutionLayerBiasAccumulateKernel &&) = default;
+    /** Default destructor */
+    ~NEDirectConvolutionLayerBiasAccumulateKernel() = default;
+    /** Set the accumulate buffer and the biases of the kernel.
+     *
+     * @param[in, out] input  Input to add the bias to. If @p output is not specified then accumulation is done in-place.
+     *                        Data type supported: QS8/F32
+     * @param[in]      bias   The shared bias tensor to add. It must be 1D Tensor. Data type supported: Same as @p input
+     * @param[out]     output (Optional) If the output tensor is specified the accumulation is done out-of-place. (Defaults to nullptr)
+     *                         Data type supported: Same as @p input
+     */
+    void configure(ITensor *input, const ITensor *bias, ITensor *output = nullptr);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+private:
+    using BiasAccumulateKernel = void(ITensor *input, const ITensor *bias, const Window window, ITensor *output);
+
+private:
+    BiasAccumulateKernel *_func;
+    ITensor              *_input;
+    const ITensor        *_bias;
+    ITensor              *_output;
+};
+}
+#endif /*__ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYERBIASACCUMULATEKERNEL_H__ */
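Since the bias is shared, there is one bias value per output feature map, added to every spatial position of that map. A scalar, illustrative reference assuming a tightly packed float buffer (not library API):

// Accumulate a per-feature-map bias into a [width, height, ofm] buffer.
void bias_accumulate_ref(float *out, const float *bias,
                         int width, int height, int ofm)
{
    for(int z = 0; z < ofm; ++z)
    {
        for(int i = 0; i < width * height; ++i)
        {
            out[z * width * height + i] += bias[z];
        }
    }
}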
diff --git a/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h
new file mode 100644
index 0000000..d726071
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYERKERNEL_H__
+#define __ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYERKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON interface for Direct Convolution Layer kernel */
+class NEDirectConvolutionLayerKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NEDirectConvolutionLayerKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEDirectConvolutionLayerKernel(const NEDirectConvolutionLayerKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEDirectConvolutionLayerKernel &operator=(const NEDirectConvolutionLayerKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEDirectConvolutionLayerKernel(NEDirectConvolutionLayerKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEDirectConvolutionLayerKernel &operator=(NEDirectConvolutionLayerKernel &&) = default;
+    /** Default destructor */
+    ~NEDirectConvolutionLayerKernel() = default;
+    /** Set the input, weights and output tensors.
+      *
+      * @param[in]  input     Input tensor. Data types supported: QS8/F32.
+      * @param[in]  weights   Set of kernels to convolve the input volume.
+      *                       The 3rd dimension must be the same as the input's volume 3rd dimension.
+      *                       Data type supported: Same as @p input.
+      * @param[out] output    Output tensor.
+      *                       The 3rd dimension must be equal to the 4th dimension of the @p weights tensor. Data types supported: Same as @p input.
+      * @param[in]  conv_info Contains padding and stride information described in @ref PadStrideInfo.
+      */
+    void configure(const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+    BorderSize border_size() const override;
+
+private:
+    const ITensor *_input;
+    const ITensor *_weights;
+    ITensor       *_output;
+    PadStrideInfo  _conv_info;
+    BorderSize     _border_size;
+    unsigned int   _kernel_size;
+    unsigned int   _num_elems_read_per_iteration;
+    unsigned int   _num_elems_written_per_iteration;
+};
+}
+#endif /*__ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYERKERNEL_H__ */
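For reference, the usual output-size arithmetic of a direct convolution with symmetric padding and floor rounding is sketched below; the exact padding and stride the kernel uses come from the PadStrideInfo passed to configure() (the helper name is illustrative only):

// out_dim = (in_dim + 2 * pad - kernel_size) / stride + 1   (floor division)
inline unsigned int conv_out_dim_ref(unsigned int in_dim, unsigned int kernel_size,
                                     unsigned int pad, unsigned int stride)
{
    return (in_dim + 2 * pad - kernel_size) / stride + 1;
}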
diff --git a/arm_compute/core/NEON/kernels/NEFillBorderKernel.h b/arm_compute/core/NEON/kernels/NEFillBorderKernel.h
index 0829cc7..3ec6611 100644
--- a/arm_compute/core/NEON/kernels/NEFillBorderKernel.h
+++ b/arm_compute/core/NEON/kernels/NEFillBorderKernel.h
@@ -53,7 +53,7 @@
      *
      * @note This kernel fills the borders within the XY-planes.
      *
-     * @param[in,out] tensor                Tensor to process. Data types supported: U8, S16, S32, F32.
+     * @param[in,out] tensor                Tensor to process. Data types supported: U8/S8/QS8/QS16/S16/S32/F32.
      * @param[in]     border_size           Size of the border to fill in elements.
      * @param[in]     border_mode           Border mode to use for the convolution.
      * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
diff --git a/arm_compute/core/NEON/kernels/NEFillInnerBorderKernel.h b/arm_compute/core/NEON/kernels/NEFillInnerBorderKernel.h
index 1c8ef32..61e6e46 100644
--- a/arm_compute/core/NEON/kernels/NEFillInnerBorderKernel.h
+++ b/arm_compute/core/NEON/kernels/NEFillInnerBorderKernel.h
@@ -53,7 +53,7 @@
      *
      * @note This kernel fills the borders within the XY-planes.
      *
-     * @param[in,out] input                 Tensor to process. Data types supported: U8, S16, S32, F32.
+     * @param[in,out] input                 Tensor to process. Data types supported: U8/QS8/S16/S32/F32.
      * @param[in]     border_size           Size of the border to fill in elements.
      * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
      *
diff --git a/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h b/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h
index 7790cf1..b9884ff 100644
--- a/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h
+++ b/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h
@@ -56,7 +56,7 @@
     NEGEMMInterleave4x4Kernel();
     /** Initialise the kernel's input and output.
      *
-     * @param[in]  input  Input tensor. Data types supported: U8/S8/U16/S16/F16/U32/S32/F32
+     * @param[in]  input  Input tensor. Data types supported: U8/S8/QS8/U16/S16/F16/U32/S32/F32
      * @param[out] output Output tensor which stores the interleaved matrix. Data type supported: same as @p input.
      */
     void configure(const ITensor *input, ITensor *output);
@@ -67,7 +67,7 @@
 private:
     /** Common signature for all the transpose functions
      *
-     * @param[in]  input  An input tensor. Data types supported: U8/S8/U16/S16/F16/U32/S32/F32
+     * @param[in]  input  An input tensor. Data types supported: U8/S8/QS8/U16/S16/F16/U32/S32/F32
      * @param[out] output The output tensor. Data type supported: same as @p input
      * @param[in]  window Region on which to execute the kernel.
      */
diff --git a/arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h b/arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h
index 7d6806d..c0ecafc 100644
--- a/arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h
+++ b/arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h
@@ -47,7 +47,7 @@
     ~NEGEMMMatrixAccumulateBiasesKernel() = default;
     /** Set the accumulate buffer and the biases of the kernel.
      *
-     * @param[in, out] accum  The accumulate tensor to convert. Data type supported: F32
+     * @param[in, out] accum  The accumulate tensor to convert. Data type supported: QS8/F32
      * @param[in]      biases The shared biases tensor to append. It must be 1D Tensor. Data type supported: Same as @p input
      */
     void configure(ITensor *accum, const ITensor *biases);
diff --git a/arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h b/arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h
index d1eccec..1ab52fa 100644
--- a/arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h
+++ b/arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h
@@ -55,17 +55,27 @@
      *
      * @note The input and output tensor must have the same dimensions
      *
-     * @param[in]      input  Input tensor (Matrix C). Data types supported: F32, F16.
+     * @param[in]      input  Input tensor (Matrix C). Data types supported: QS8/F16/F32
      * @param[in, out] output Output tensor. If this kernel is used to finalize the GEMM result, output contains the result obtained by the kernel @ref NEGEMMMatrixMultiplyKernel. Data type supported: the same as @p input.
      * @param[in]      beta   Weight of matrix C
      */
-    void configure(const ITensor *input, ITensor *output, const float beta);
+    void configure(const ITensor *input, ITensor *output, float beta);
 
     // Inherited methods overridden:
     void run(const Window &window) override;
 
 private:
-    float _beta;
+    /** Common signature for all the matrix addition functions
+     *
+     * @param[in]  input  An input tensor. Data types supported: QS8/F16/F32
+     * @param[out] output The output tensor. Data type supported: same as @p input
+     * @param[in]  window Region on which to execute the kernel.
+     * @param[in]  beta   Weight of matrix C
+     */
+    using MatrixAdditionFunction = void(const ITensor *input, ITensor *output, const Window &window, float beta);
+    /** Matrix addition function to use for the particular tensor types passed to configure() */
+    MatrixAdditionFunction *_func;
+    float                   _beta;
 };
 }
 #endif /* __ARM_COMPUTE_NEGEMMMATRIXADDITIONKERNEL_H__ */
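When finalising a GEMM, the multiply kernel has already written A*B into the output; this kernel then accumulates beta * C on top of it. A scalar, illustrative reference (not library API):

#include <cstddef>

// output += beta * C, element by element.
void gemm_matrix_addition_ref(const float *c, float *output, size_t elements, float beta)
{
    for(size_t i = 0; i < elements; ++i)
    {
        output[i] += beta * c[i];
    }
}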
diff --git a/arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h b/arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h
index f45fb0f..a684945 100644
--- a/arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h
+++ b/arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h
@@ -54,7 +54,7 @@
      * @note If the output tensor is a matrix, the input matrices @p input0 and @p input1 should be the output of the kernels: @ref NEGEMMInterleave4x4Kernel and @ref NEGEMMTranspose1xWKernel
      *       These two kernels change the layout of the original matrices to be more cache-friendly.
      *
-     * @param[in]  input0 Input tensor containing the interleaved Matrix A or the vector A. Data types supported: F32, F16.
+     * @param[in]  input0 Input tensor containing the interleaved Matrix A or the vector A. Data types supported: F16/F32
      * @param[in]  input1 Input tensor containing the transposed Matrix B if the first input tensor A is not a vector.
      *                    If the output tensor is a vector, input1 must contain the matrix B not reshaped. Data type supported: same as @p input0
      * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0.
diff --git a/arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h b/arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h
index 416b55f..5d8a369 100644
--- a/arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h
+++ b/arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h
@@ -30,9 +30,9 @@
 {
 class ITensor;
 
-/** NEON kernel which transposes the elements of a matrix in chunks of 1x4 if the input data type is F32 or in chunks of 1x8 if the input data type is F16.
+/** NEON kernel which transposes the elements of a matrix in chunks of 1xW, where W is equal to (16 / element size of the tensor)
  *
- * Following an example of how the transposition1xW works when the input data type is F32
+ * Following an example of how the transposition1xW works when the input data is F32
  *
  * @f[
  * \left( \begin{array}{cccc}
@@ -62,8 +62,7 @@
  * \end{array} \right)
  * @f]
  *
- * @note If the input data type is F32, the output matrix will have the following shape: [ height * 4, width / 4 ]
- * @note If the input data type is F16, the output matrix will have the following shape: [ height * 8, width / 8 ]
+ * @note The output matrix will have the following shape: [ height * W, ceil(width / W) ], where W = (16 / element size of the tensor)
  *
  */
 class NEGEMMTranspose1xWKernel : public INESimpleKernel
@@ -71,7 +70,7 @@
 public:
     /** Initialise the kernel's input and output.
      *
-     * @param[in]  input  Input tensor. Data types supported: F32, 16.
+     * @param[in]  input  Input tensor. Data types supported: U8/S8/QS8/U16/S16/F16/U32/S32/F32
      * @param[out] output Output tensor. Data type supported: same as @p input.
      */
     void configure(const ITensor *input, ITensor *output);
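The note above fully determines the reshaped dimensions; a small, illustrative helper computing them (struct and function names are not library API):

#include <cstddef>

struct Shape2D
{
    size_t width;
    size_t height;
};

// W = 16 / element_size; output shape = [ height * W, ceil(width / W) ].
inline Shape2D transpose1xw_output_shape_ref(Shape2D in, size_t element_size)
{
    const size_t w = 16 / element_size; // 4 for F32, 8 for F16, 16 for U8
    return { in.height * w, (in.width + w - 1) / w };
}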
diff --git a/arm_compute/core/NEON/kernels/NEHOGDescriptorKernel.h b/arm_compute/core/NEON/kernels/NEHOGDescriptorKernel.h
index 24fa032..dd85778 100644
--- a/arm_compute/core/NEON/kernels/NEHOGDescriptorKernel.h
+++ b/arm_compute/core/NEON/kernels/NEHOGDescriptorKernel.h
@@ -53,7 +53,7 @@
      *
      * @param[in]  input_magnitude Input tensor which stores the magnitude of the gradient for each pixel. Data type supported: S16.
      * @param[in]  input_phase     Input tensor which stores the phase of the gradient for each pixel. Data type supported: U8
-     * @param[out] output          Output tensor which stores the local HOG for each cell. DataType supported: F32. Number of channels supported: equal to the number of histogram bins per cell
+     * @param[out] output          Output tensor which stores the local HOG for each cell. Data type supported: F32. Number of channels supported: equal to the number of histogram bins per cell
      * @param[in]  hog_info        HOG's metadata
      */
     void configure(const ITensor *input_magnitude, const ITensor *input_phase, ITensor *output, const HOGInfo *hog_info);
diff --git a/arm_compute/core/NEON/kernels/NEHOGDetectorKernel.h b/arm_compute/core/NEON/kernels/NEHOGDetectorKernel.h
index bda213b..e56d1e5 100644
--- a/arm_compute/core/NEON/kernels/NEHOGDetectorKernel.h
+++ b/arm_compute/core/NEON/kernels/NEHOGDetectorKernel.h
@@ -79,6 +79,7 @@
     size_t                 _block_stride_height;
     size_t                 _detection_window_width;
     size_t                 _detection_window_height;
+    size_t                 _max_num_detection_windows;
     std::mutex             _mutex;
 };
 }
diff --git a/arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h b/arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h
index 3bcd686..0abd73e 100644
--- a/arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h
+++ b/arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h
@@ -57,7 +57,7 @@
 public:
     /** Setup the kernel parameters
      *
-     * @param[in]  input1           Source image (gradient X). Data types supported: S16, S32
+     * @param[in]  input1           Source image (gradient X). Data types supported: S16/S32
      * @param[in]  input2           Source image (gradient Y). Data types supported: same as @ input1
      * @param[out] output           Destination image (harris score). Data types supported: F32
      * @param[in]  norm_factor      Normalization factor to use accordingly with the gradient size (Must be different from 0)
diff --git a/arm_compute/core/NEON/kernels/NEIm2ColKernel.h b/arm_compute/core/NEON/kernels/NEIm2ColKernel.h
index ba5077a..ebaafb4 100644
--- a/arm_compute/core/NEON/kernels/NEIm2ColKernel.h
+++ b/arm_compute/core/NEON/kernels/NEIm2ColKernel.h
@@ -72,7 +72,7 @@
     /** Set the input and output of the kernel.
      *
      * @param[in]  input          The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM],
-     *                            while every optional dimension from 4 and above represent a batch of inputs. Data types supported: F32
+     *                            while every optional dimension from 4 and above represents a batch of inputs. Data types supported: QS8/F32
      * @param[out] output         The output tensor. Data types supported: Same as @p input
      * @param[in]  convolved_dims The convolved output dimensions.
      * @param[in]  conv_info      Contains padding and stride information described in @ref PadStrideInfo.
@@ -84,15 +84,17 @@
     void run(const Window &window) override;
 
 private:
-    /** Run the im2col used for the convolution layer case
+    /** Template function to run the im2col optimised for the fully connected layer case
      *
      * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
      */
+    template <typename T>
     void run_reduced(const Window &window);
-    /** Run the im2col optimised for the fully connected layer case
+    /** Template function to run the im2col used for the convolution layer case
      *
      * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
      */
+    template <typename T, bool has_pads>
     void run_generic(const Window &window);
     /** Common signature for all the specialised im2col functions
      *
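As background for the two code paths above, im2col rewrites every receptive field of the input as one column of a matrix so the convolution can be computed as a GEMM. A minimal scalar sketch for a single feature map, stride 1 and no padding (illustrative only):

// Unroll every kernel_w x kernel_h patch of `in` into consecutive elements of `out`.
void im2col_ref(const float *in, int in_w, int in_h,
                int kernel_w, int kernel_h, float *out)
{
    const int out_w = in_w - kernel_w + 1;
    const int out_h = in_h - kernel_h + 1;
    int       idx   = 0;

    for(int y = 0; y < out_h; ++y)
    {
        for(int x = 0; x < out_w; ++x)
        {
            for(int ky = 0; ky < kernel_h; ++ky)
            {
                for(int kx = 0; kx < kernel_w; ++kx)
                {
                    out[idx++] = in[(y + ky) * in_w + (x + kx)];
                }
            }
        }
    }
}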
diff --git a/arm_compute/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.h b/arm_compute/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.h
new file mode 100644
index 0000000..d4bff66
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NELOCALLYCONNECTEDMATRIXMULTIPLYKERNEL_H__
+#define __ARM_COMPUTE_NELOCALLYCONNECTEDMATRIXMULTIPLYKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel to multiply each row of first tensor with low 2 dimensions of second tensor. */
+class NELocallyConnectedMatrixMultiplyKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NELocallyConnectedMatrixMultiplyKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NELocallyConnectedMatrixMultiplyKernel(const NELocallyConnectedMatrixMultiplyKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NELocallyConnectedMatrixMultiplyKernel &operator=(const NELocallyConnectedMatrixMultiplyKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NELocallyConnectedMatrixMultiplyKernel(NELocallyConnectedMatrixMultiplyKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NELocallyConnectedMatrixMultiplyKernel &operator=(NELocallyConnectedMatrixMultiplyKernel &&) = default;
+    /** Initialise the kernel's input and output
+     *
+     * @param[in]  input0 First input tensor. Data types supported: F32
+     * @param[in]  input1 Second input tensor containing the Matrix B. Data type supported: same as @p input0
+     * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0
+     */
+    void configure(const ITensor *input0, const ITensor *input1, ITensor *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+private:
+    const ITensor *_input0;
+    const ITensor *_input1;
+    ITensor       *_output;
+};
+}
+#endif /* __ARM_COMPUTE_NELOCALLYCONNECTEDMATRIXMULTIPLYKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h b/arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h
index bd84810..0daae59 100644
--- a/arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h
+++ b/arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h
@@ -54,7 +54,7 @@
 
     /** Initialise the kernel's sources, destinations and border mode.
      *
-     * @param[in]  input            Source tensor. Data types supported: U8, F32
+     * @param[in]  input            Source tensor. Data types supported: U8/F32
      * @param[out] output           Destination tensor. Data types supported: same as @p input
      * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
      */
@@ -86,7 +86,7 @@
 public:
     /** Initialise the kernel's sources, destinations and border mode.
      *
-     * @param[in]  input            Source tensor. Data types supported: U8, F32.
+     * @param[in]  input            Source tensor. Data types supported: U8/F32.
      * @param[out] output           Destination tensor. Data types supported: same as @p input
      * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
      */
diff --git a/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h b/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h
index 18d198c..d4e36d5 100644
--- a/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h
@@ -47,11 +47,10 @@
     NENormalizationLayerKernel &operator=(NENormalizationLayerKernel &&) = default;
     /** Default destructor */
     ~NENormalizationLayerKernel() = default;
-
     /** Set the input and output tensors.
      *
      * @param[in]  input         Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM],
-     *                           and an optional 4th dimension for batch of inputs. Data types supported: F32.
+     *                           and an optional 4th dimension for batch of inputs. Data types supported: QS8/F32.
      * @param[in]  input_squared Source with each element has been squared. 3 lower dims represent a single input with dimensions [width, height, IFM],
      *                           Data type supported: same as @p input
      * @param[out] output        Destination tensor. Output will have the same number of dimensions as input. Data type supported: same as @p input
@@ -64,17 +63,34 @@
     BorderSize border_size() const override;
 
 private:
-    /** Function to perform normalization depending on the given templates dimension.
+    /** Function to perform normalization depending on the given template
+     *  dimension. The second template parameter specifies whether the
+     *  normalization has to be 1D or 2D.
      *
-     * @note Only normalization across X and Z is currently supported and tested.
+     * @note Only supported normalizations are:
+     *  - 1D over X or Z
+     *  - 2D over X and Y
      *
-     * @param window Region on which to execute the kernel.
+     * @param[in] window Region on which to execute the kernel.
      */
-    template <unsigned int dim>
+    template <unsigned int dim, bool do_2D_norm>
     void normalize(const Window &window);
+
+    /** Function to perform normalization for fixed-point values depending on
+     * the given template dimension. The second template parameter specifies
+     * whether the normalization has to be 1D or 2D.
+     *
+     * @note Only supported normalizations are:
+     *  - 1D over X or Z
+     *  - 2D over X and Y
+     *
+     * @param[in] window Region on which to execute the kernel.
+     */
+    template <unsigned int dim, bool do_2D_norm>
+    void normalize_fixed_point(const Window &window);
     /** Common signature for all the specialised normalization functions
      *
-     * @param window  Region on which to execute the kernel.
+     * @param[in] window Region on which to execute the kernel.
      */
     using NormalizationFunction = void (NENormalizationLayerKernel::*)(const Window &window);
 
diff --git a/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h b/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h
index 0891d0c..7e402cd 100644
--- a/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h
+++ b/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h
@@ -47,15 +47,14 @@
     NEPixelWiseMultiplicationKernel &operator=(NEPixelWiseMultiplicationKernel &&) = default;
     /** Default destructor */
     ~NEPixelWiseMultiplicationKernel() = default;
-
     /** Initialise the kernel's input, output and border mode.
      *
      * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported.
      *       For all other scale values only round to zero (implemented as round towards minus infinity) is supported.
      *
-     * @param[in]  input1          An input tensor. Data types supported: U8, S16, F32.
-     * @param[in]  input2          An input tensor. Data types supported: U8, S16, F32.
-     * @param[out] output          The output tensor. Data types supported: U8 (Only if both inputs are U8), S16, F32.
+     * @param[in]  input1          An input tensor. Data types supported: U8/QS8/S16/F32.
+     * @param[in]  input2          An input tensor. Data types supported: U8/QS8/S16/F32.
+     * @param[out] output          The output tensor. Data types supported: U8 (Only if both inputs are U8)/S16/F32.
      * @param[in]  scale           Scale to apply after multiplication.
      *                             Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
      * @param[in]  overflow_policy Overflow policy.
@@ -71,19 +70,29 @@
      *
      * @param[in]  input1_ptr Pointer to the first input tensor.
      * @param[in]  input2_ptr Pointer to the second input tensor.
-     * @param[out] output_ptr Pointer to the output tensor
+     * @param[out] output_ptr Pointer to the output tensor.
      */
     using MulFunctionInt = void(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int scale);
+    /** Common signature for all the specialised multiplication functions with fixed-point values
+     *
+     * @param[in]  input1_ptr           Pointer to the first input tensor.
+     * @param[in]  input2_ptr           Pointer to the second input tensor.
+     * @param[in]  scale                Scaling factor.
+     * @param[in]  fixed_point_position Fixed-point position that expresses the number of bits for the fractional part of the number.
+     * @param[out] output_ptr           Pointer to the output tensor.
+     */
+    using MulFunctionQInt = void(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int scale, int fixed_point_position);
     /** Common signature for all the specialised multiplication functions with float scaling factor
      *
      * @param[in]  input1_ptr Pointer to the first input tensor.
      * @param[in]  input2_ptr Pointer to the second input tensor.
-     * @param[out] output_ptr Pointer to the output tensor
+     * @param[out] output_ptr Pointer to the output tensor.
      */
     using MulFunctionFloat = void(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, float scale);
 
     MulFunctionFloat *_func_float;
     MulFunctionInt   *_func_int;
+    MulFunctionQInt  *_func_q_int;
 
 private:
     const ITensor *_input1;
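The extra fixed_point_position argument is what distinguishes MulFunctionQInt: the product of two Q(7-n).n values carries 2n fractional bits and has to be shifted back by n and saturated to the narrow type. A scalar model of that idea for QS8 follows; it is illustrative only, the real kernel is NEON-vectorised and its rounding policy is configurable.

#include <algorithm>
#include <cstdint>

// Scalar model: multiply two QS8 values with n = fixed_point_position fractional bits.
inline int8_t sqmul_qs8(int8_t a, int8_t b, int fixed_point_position)
{
    const int32_t round = (fixed_point_position > 0) ? (1 << (fixed_point_position - 1)) : 0;
    const int32_t prod  = static_cast<int32_t>(a) * static_cast<int32_t>(b);
    const int32_t res   = (prod + round) >> fixed_point_position;
    // Saturate to the signed 8-bit range
    return static_cast<int8_t>(std::min<int32_t>(127, std::max<int32_t>(-128, res)));
}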
diff --git a/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h b/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h
index 728b2ff..62a0878 100644
--- a/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h
@@ -46,10 +46,9 @@
     NEPoolingLayerKernel &operator=(NEPoolingLayerKernel &&) = default;
     /** Default destructor */
     ~NEPoolingLayerKernel() = default;
-
     /** Set the input and output tensors.
      *
-     * @param[in]  input     Source tensor. Data types supported: F32.
+     * @param[in]  input     Source tensor. Data types supported: QS8/F32.
      * @param[out] output    Destination tensor. Data types supported: Same as @p input.
      * @param[in]  pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
      */
@@ -66,14 +65,28 @@
      * @param[in] window       Output region on which to execute the kernel.
      */
     template <PoolingType pooling_type>
-    void pooling2(const Window &window_input, const Window &window);
+    void pooling2_f32(const Window &window_input, const Window &window);
+    /** Function to perform 2x2 pooling for 8-bit fixed point.
+     *
+     * @param[in] window_input Input region on which to execute the kernel.
+     * @param[in] window       Output region on which to execute the kernel.
+     */
+    template <PoolingType pooling_type>
+    void pooling2_q8(const Window &window_input, const Window &window);
     /** Function to perform 3x3 pooling.
      *
      * @param[in] window_input Input region on which to execute the kernel.
      * @param[in] window       Output region on which to execute the kernel.
      */
     template <PoolingType pooling_type>
-    void pooling3(const Window &window_input, const Window &window);
+    void pooling3_f32(const Window &window_input, const Window &window);
+    /** Function to perform 3x3 pooling for 8-bit fixed point.
+     *
+     * @param[in] window_input Input region on which to execute the kernel.
+     * @param[in] window       Output region on which to execute the kernel.
+     */
+    template <PoolingType pooling_type>
+    void pooling3_q8(const Window &window_input, const Window &window);
     /** Common signature for all the specialised Pooling functions
      *
      * @param[in] window_input Input region on which to execute the kernel.
@@ -86,6 +99,7 @@
     const ITensor   *_input;
     ITensor         *_output;
     PoolingLayerInfo _pool_info;
+    int              _num_elems_processed_per_iteration;
     BorderSize       _border_size;
 };
 }
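For the QS8 paths the pooling arithmetic stays in the input's Q format: averaging divides the sum of the window by the pool area, so no rescale by the fixed point position is needed. A scalar model of the 2x2 average case, illustrative only (the kernel itself is NEON-vectorised and handles rounding and saturation):

// Scalar model: 2x2 average pooling on QS8 samples, Q format preserved.
inline int8_t avg_pool_2x2_qs8(int8_t a, int8_t b, int8_t c, int8_t d)
{
    const int32_t sum = static_cast<int32_t>(a) + b + c + d; // promoted, no overflow
    return static_cast<int8_t>(sum / 4);                     // truncating division, simplified
}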
diff --git a/arm_compute/core/NEON/kernels/NEScaleKernel.h b/arm_compute/core/NEON/kernels/NEScaleKernel.h
index 0f11e7e..03e2652 100644
--- a/arm_compute/core/NEON/kernels/NEScaleKernel.h
+++ b/arm_compute/core/NEON/kernels/NEScaleKernel.h
@@ -52,11 +52,11 @@
      *
      * @note dx, dy and offsets have the same dimensions (width and height) of the output tensor
      *
-     * @param[in]  input            Source tensor. Data types supported: U8 or S16.
+     * @param[in]  input            Source tensor. Data types supported: U8/S16.
      * @param[in]  dx               Pixel's distance between the X real coordinate and the smallest X following integer. Data type supported: F32
      * @param[in]  dy               Pixel's distance between the Y real coordinate and the smallest Y following integer. Data type supported: F32
      * @param[in]  offsets          Offset to access the pixel with NEAREST interpolation or the top-left pixel with BILINEAR interpolation in the input tensor. Data type supported: S32.
-     * @param[out] output           Destination tensor. Data types supported: U8 or S16. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
+     * @param[out] output           Destination tensor. Data types supported: U8/S16. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
      * @param[in]  policy           Interpolation type to use
      * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
      */
diff --git a/arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h b/arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h
index 83d55d3..ab626ad 100644
--- a/arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h
@@ -39,7 +39,7 @@
     NELogits1DMaxKernel();
     /** Set the input and output tensors.
      *
-     * @param[in]  input  Source tensor. Data types supported: F32.
+     * @param[in]  input  Source tensor. Data types supported: QS8/F32.
      * @param[out] output Destination tensor. Data types supported: same as @p input
      */
     void configure(const ITensor *input, ITensor *output);
@@ -49,7 +49,11 @@
     BorderSize border_size() const override;
 
 private:
-    BorderSize _border_size;
+    using Logits1DMaxFunction = void(const ITensor *in, ITensor *out, const Window &window);
+
+private:
+    Logits1DMaxFunction *_func;
+    BorderSize           _border_size;
 };
 
 /** Interface for shifting the logits values around the max value and exponentiating the result */
@@ -68,10 +72,9 @@
     NELogits1DShiftExpSumKernel &operator=(NELogits1DShiftExpSumKernel &&) = default;
     /** Default destructor */
     ~NELogits1DShiftExpSumKernel() = default;
-
     /** Set the input and output tensors.
      *
-     * @param[in]  input  Source tensor. Data types supported: F32.
+     * @param[in]  input  Source tensor. Data types supported: QS8/F32.
      * @param[in]  max    Max values tensor. Data types supported: same as @p input.
      * @param[out] output Destination tensor. Data types supported: same as @p input.
      * @param[out] sum    Sum of 1D logits tensor. Data types supported: same as @p input.
@@ -80,14 +83,16 @@
 
     // Inherited methods overridden:
     void run(const Window &window) override;
-    BorderSize border_size() const override;
 
 private:
-    const ITensor *_input;
-    const ITensor *_max;
-    ITensor       *_output;
-    ITensor       *_sum;
-    BorderSize     _border_size;
+    using Logits1DShiftExpSumFunction = void(const ITensor *in, const ITensor *max, ITensor *out, ITensor *sum, const Window &window);
+
+private:
+    Logits1DShiftExpSumFunction *_func;
+    const ITensor               *_input;
+    const ITensor               *_max;
+    ITensor                     *_output;
+    ITensor                     *_sum;
 };
 
 /** Interface for calculating the final step of the Softmax Layer where each logit value is multiplied by the inverse of the sum of the logits. */
@@ -106,10 +111,9 @@
     NELogits1DNormKernel &operator=(NELogits1DNormKernel &&) = default;
     /** Default destructor */
     ~NELogits1DNormKernel() = default;
-
     /** Set the input and output tensors.
      *
-     * @param[in]  input  Source tensor. Data types supported: F32.
+     * @param[in]  input  Source tensor. Data types supported: QS8/F32.
      * @param[in]  sum    Sum tensor. The number of dimensions should be dim(input)-1. Data types supported: same as @p input.
      * @param[out] output Destination tensor. Data types supported: same as @p input.
      */
@@ -119,9 +123,13 @@
     void run(const Window &window) override;
 
 private:
-    const ITensor *_input;
-    const ITensor *_sum;
-    ITensor       *_output;
+    using Logits1DNormFunction = void(const ITensor *in, const ITensor *sum, ITensor *out, const Window &window);
+
+private:
+    Logits1DNormFunction *_func;
+    const ITensor        *_input;
+    const ITensor        *_sum;
+    ITensor              *_output;
 };
 }
 #endif /*__ARM_COMPUTE_NESOFTMAXLAYERKERNEL_H__ */
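All three softmax kernels now hold a run-time selected function pointer instead of doing the work in run() directly, which is how the QS8 and F32 paths share one configure(). A sketch of the selection, using hypothetical free-function names:

// Hypothetical implementation detail, for illustration only.
switch(input->info()->data_type())
{
    case DataType::QS8:
        _func = &logits_1d_max_qs8;
        break;
    case DataType::F32:
        _func = &logits_1d_max_f32;
        break;
    default:
        ARM_COMPUTE_ERROR("Unsupported data type");
}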
diff --git a/arm_compute/core/NEON/kernels/NETableLookupKernel.h b/arm_compute/core/NEON/kernels/NETableLookupKernel.h
index 499b87f..b3963e5 100644
--- a/arm_compute/core/NEON/kernels/NETableLookupKernel.h
+++ b/arm_compute/core/NEON/kernels/NETableLookupKernel.h
@@ -47,7 +47,7 @@
     NETableLookupKernel &operator=(NETableLookupKernel &&) = default;
     /** Initialise the kernel's input, lut and output.
      *
-     * @param[in]  input  An input tensor. Data types supported: U8, S16.
+     * @param[in]  input  An input tensor. Data types supported: U8/S16.
      * @param[in]  lut    The input LUT.
      * @param[out] output The output tensor. Data types supported: same as @p input
      */
diff --git a/arm_compute/core/NEON/kernels/NETransposeKernel.h b/arm_compute/core/NEON/kernels/NETransposeKernel.h
index 4d82383..ac9449f 100644
--- a/arm_compute/core/NEON/kernels/NETransposeKernel.h
+++ b/arm_compute/core/NEON/kernels/NETransposeKernel.h
@@ -53,7 +53,7 @@
 
     /** Initialise the kernel's input and output.
      *
-     * @param[in]  input  Input tensor. Data types supported: U8/S8/U16/S16/F16/U32/S32/F32
+     * @param[in]  input  Input tensor. Data types supported: U8/S8/QS8/U16/S16/F16/U32/S32/F32
      * @param[out] output Output tensor. Data type supported: Same as @p input
      */
     void configure(const ITensor *input, ITensor *output);
@@ -64,7 +64,7 @@
 private:
     /** Common signature for all the transpose functions
      *
-     * @param[in]  input  An input tensor. Data types supported: U8/S8/U16/S16/F16/U32/S32/F32
+     * @param[in]  input  An input tensor. Data types supported: U8/S8/QS8/U16/S16/F16/U32/S32/F32
      * @param[out] output The output tensor. Data type supported: same as @p input
      * @param[in]  window Region on which to execute the kernel.
      */
diff --git a/arm_compute/core/NEON/kernels/NEConvolutionLayerWeightsReshapeKernel.h b/arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h
similarity index 65%
rename from arm_compute/core/NEON/kernels/NEConvolutionLayerWeightsReshapeKernel.h
rename to arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h
index 6057b2f..cad2d00 100644
--- a/arm_compute/core/NEON/kernels/NEConvolutionLayerWeightsReshapeKernel.h
+++ b/arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef __ARM_COMPUTE_NECONVOLUTIONLAYERWEIGHTSRESHAPEKERNEL_H__
-#define __ARM_COMPUTE_NECONVOLUTIONLAYERWEIGHTSRESHAPEKERNEL_H__
+#ifndef __ARM_COMPUTE_NEWEIGHTSRESHAPEKERNEL_H__
+#define __ARM_COMPUTE_NEWEIGHTSRESHAPEKERNEL_H__
 
 #include "arm_compute/core/NEON/INEKernel.h"
 
@@ -30,7 +30,7 @@
 {
 class ITensor;
 
-/** NEON kernel to perform reshaping on the weights used by convolution layer.
+/** NEON kernel to perform reshaping on the weights used by convolution and locally connected layers.
  *
  * Rearranges each 3-dimensional kernel to a single row leading to a matrix with linearized kernels.
  * In combination with the @ref NEIm2ColKernel can transform a convolution to a matrix multiplication.
@@ -53,27 +53,28 @@
  * \end{array} \right)
  * @f]
  */
-class NEConvolutionLayerWeightsReshapeKernel : public INEKernel
+class NEWeightsReshapeKernel : public INEKernel
 {
 public:
-    /** Default constructor */
-    NEConvolutionLayerWeightsReshapeKernel();
+    /** Default constructor */
+    NEWeightsReshapeKernel();
     /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEConvolutionLayerWeightsReshapeKernel(const NEConvolutionLayerWeightsReshapeKernel &) = delete;
+    NEWeightsReshapeKernel(const NEWeightsReshapeKernel &) = delete;
     /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEConvolutionLayerWeightsReshapeKernel &operator=(const NEConvolutionLayerWeightsReshapeKernel &) = delete;
+    NEWeightsReshapeKernel &operator=(const NEWeightsReshapeKernel &) = delete;
     /** Allow instances of this class to be moved */
-    NEConvolutionLayerWeightsReshapeKernel(NEConvolutionLayerWeightsReshapeKernel &&) = default;
+    NEWeightsReshapeKernel(NEWeightsReshapeKernel &&) = default;
     /** Allow instances of this class to be moved */
-    NEConvolutionLayerWeightsReshapeKernel &operator=(NEConvolutionLayerWeightsReshapeKernel &&) = default;
+    NEWeightsReshapeKernel &operator=(NEWeightsReshapeKernel &&) = default;
     /** Default destructor */
-    ~NEConvolutionLayerWeightsReshapeKernel() = default;
-
+    ~NEWeightsReshapeKernel() = default;
     /** Set the input and output of the kernel.
      *
-     * @param[in]  input  The input tensor to convert. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data types supported: F32
-     * @param[in]  bias   The shared bias tensor to append. Biases are 1D tensor with dimensions [OFM]. Data types supported: Same as @p input
-     * @param[out] output The output tensor. Should be a 2D Tensor. Data types supported: Same as @p input
+     * @param[in]  input  The input tensor to convert. Weights are a 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared,
+     *                    and a 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared. Data types supported: QS8/F32
+     * @param[in]  bias   The biases tensor to append. The bias is a 1D tensor with dimensions [OFM] if shared, and a 2D tensor with
+     *                    dimensions [OFM, num_patches] if unshared. Data types supported: Same as @p input
+     * @param[out] output The output tensor. Data types supported: Same as @p input
      */
     void configure(const ITensor *input, const ITensor *bias, ITensor *output);
 
@@ -81,11 +82,13 @@
     void run(const Window &window) override;
 
 private:
-    const ITensor *_input;
-    const ITensor *_bias;
-    ITensor       *_output;
-    bool           _has_bias;
+    using WeightsReshapeKernel = void(const ITensor *input, const ITensor *bias, ITensor *output, const Window &window);
+
+    WeightsReshapeKernel *_func;
+    const ITensor        *_input;
+    const ITensor        *_bias;
+    ITensor              *_output;
 };
 }
 
-#endif /*__ARM_COMPUTE_NECONVOLUTIONLAYERWEIGHTSRESHAPEKERNEL_H__ */
+#endif /*__ARM_COMPUTE_NEWEIGHTSRESHAPEKERNEL_H__ */
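Read together with the class description, the reshape produces one matrix row per output feature map, each row holding the linearised kernel plus an optional bias element. A small helper showing the expected output shape for shared weights, under that reading (illustrative, not part of the library API):

// Expected reshaped size for shared weights [kernel_x, kernel_y, IFM, OFM].
TensorShape reshaped_weights_shape(const TensorShape &weights, bool has_bias)
{
    const size_t kernel_x = weights[0];
    const size_t kernel_y = weights[1];
    const size_t ifm      = weights[2];
    const size_t ofm      = weights[3];

    // One row per output feature map, one column per weight (+1 for the bias term).
    return TensorShape(kernel_x * kernel_y * ifm + (has_bias ? 1 : 0), ofm);
}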
diff --git a/arm_compute/core/PixelValue.h b/arm_compute/core/PixelValue.h
index 8a45444..b4912ce 100644
--- a/arm_compute/core/PixelValue.h
+++ b/arm_compute/core/PixelValue.h
@@ -101,6 +101,7 @@
             uint8_t  rgbx[4]; /**< 4 channels: RGBX8888 */
             float    f32;     /**< Single channel float 32 */
             uint8_t  u8;      /**< Single channel U8 */
+            int8_t   s8;      /**< Single channel S8 */
             uint16_t u16;     /**< Single channel U16 */
             int16_t  s16;     /**< Single channel S16 */
             uint32_t u32;     /**< Single channel U32 */
@@ -114,6 +115,14 @@
     {
         v = value.u8;
     }
+    /** Interpret the pixel value as a S8
+     *
+     * @param[out] v Returned value
+     */
+    void get(int8_t &v) const
+    {
+        v = value.s8;
+    }
     /** Interpret the pixel value as a U16
      *
      * @param[out] v Returned value
diff --git a/arm_compute/core/Steps.h b/arm_compute/core/Steps.h
index 4706c81..33a88a2 100644
--- a/arm_compute/core/Steps.h
+++ b/arm_compute/core/Steps.h
@@ -40,7 +40,6 @@
 class Steps : public Dimensions<unsigned int>
 {
 public:
-#ifndef DOXYGEN_SKIP_THIS /* Doxygen gets confused by the templates and can't match the implementation to the declaration */
     /** Constructor to initialize the steps.
      *
      * @param[in] steps Values to initialize the steps.
@@ -52,7 +51,6 @@
         // Initialize empty dimensions to 1
         std::fill(_id.begin() + _num_dimensions, _id.end(), 1);
     }
-#endif
     /** Allow instances of this class to be copy constructed */
     constexpr Steps(const Steps &) = default;
     /** Allow instances of this class to be copied */
diff --git a/arm_compute/core/Strides.h b/arm_compute/core/Strides.h
index efdeb11..329fafb 100644
--- a/arm_compute/core/Strides.h
+++ b/arm_compute/core/Strides.h
@@ -38,7 +38,6 @@
 class Strides : public Dimensions<size_t>
 {
 public:
-#ifndef DOXYGEN_SKIP_THIS /* Doxygen gets confused by the templates and can't match the implementation to the declaration */
     /** Constructor to initialize the strides.
      *
      * @param[in] strides Values to initialize the strides.
@@ -48,7 +47,6 @@
         : Dimensions{ strides... }
     {
     }
-#endif
     /** Allow instances of this class to be copy constructed */
     constexpr Strides(const Strides &) = default;
     /** Allow instances of this class to be copied */
diff --git a/arm_compute/core/SubTensorInfo.h b/arm_compute/core/SubTensorInfo.h
new file mode 100644
index 0000000..e2532fd
--- /dev/null
+++ b/arm_compute/core/SubTensorInfo.h
@@ -0,0 +1,184 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_SUBTENSORINFO_H__
+#define __ARM_COMPUTE_SUBTENSORINFO_H__
+
+#include "arm_compute/core/ITensorInfo.h"
+
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Strides.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Validate.h"
+
+#include <cstddef>
+
+namespace arm_compute
+{
+/** Store the sub tensor's metadata */
+class SubTensorInfo final : public ITensorInfo
+{
+public:
+    /** Default constructor */
+    SubTensorInfo();
+    /** Constructor
+     *
+     * @param[in] parent       Metadata of parent tensor.
+     * @param[in] tensor_shape Tensor shape. Shape must fit inside parent's shape.
+     *                         X and Y dimensions must match the parent's ones.
+     * @param[in] coords       Coordinates of starting element inside parent tensor.
+     */
+    SubTensorInfo(ITensorInfo *parent, const TensorShape &tensor_shape, const Coordinates &coords);
+    /** Default destructor */
+    ~SubTensorInfo() = default;
+    /** Allow instances of this class to be copy constructed */
+    SubTensorInfo(const SubTensorInfo &) = default;
+    /** Allow instances of this class to be copied */
+    SubTensorInfo &operator=(const SubTensorInfo &) = default;
+    /** Allow instances of this class to be move constructed */
+    SubTensorInfo(SubTensorInfo &&) = default;
+    /** Allow instances of this class to be moved */
+    SubTensorInfo &operator=(SubTensorInfo &&) = default;
+
+    // Inherited methods overridden:
+    void set_data_type(DataType data_type) override
+    {
+        ARM_COMPUTE_ERROR_ON(_parent == nullptr);
+        _parent->set_data_type(data_type);
+    };
+    void set_num_channels(int num_channels) override
+    {
+        ARM_COMPUTE_ERROR_ON(_parent == nullptr);
+        _parent->set_num_channels(num_channels);
+    };
+    void set_format(Format format) override
+    {
+        ARM_COMPUTE_ERROR_ON(_parent == nullptr);
+        _parent->set_format(format);
+    };
+    void set_fixed_point_position(int fixed_point_position) override
+    {
+        ARM_COMPUTE_ERROR_ON(_parent == nullptr);
+        _parent->set_fixed_point_position(fixed_point_position);
+    };
+    void set_tensor_shape(TensorShape shape) override;
+    bool auto_padding() override
+    {
+        ARM_COMPUTE_ERROR_ON(_parent == nullptr);
+        return _parent->auto_padding();
+    };
+    bool extend_padding(const PaddingSize &padding) override;
+    size_t dimension(size_t index) const override
+    {
+        return _tensor_shape[index];
+    }
+    const Strides &strides_in_bytes() const override
+    {
+        ARM_COMPUTE_ERROR_ON(_parent == nullptr);
+        return _parent->strides_in_bytes();
+    }
+    size_t offset_first_element_in_bytes() const override
+    {
+        ARM_COMPUTE_ERROR_ON(_parent == nullptr);
+        return _parent->offset_element_in_bytes(_coords);
+    }
+    size_t offset_element_in_bytes(const Coordinates &pos) const override;
+    int fixed_point_position() const override
+    {
+        ARM_COMPUTE_ERROR_ON(_parent == nullptr);
+        return _parent->fixed_point_position();
+    }
+    size_t element_size() const override
+    {
+        ARM_COMPUTE_ERROR_ON(_parent == nullptr);
+        return _parent->element_size();
+    }
+    size_t num_dimensions() const override
+    {
+        return _tensor_shape.num_dimensions();
+    }
+    size_t num_channels() const override
+    {
+        ARM_COMPUTE_ERROR_ON(_parent == nullptr);
+        return _parent->num_channels();
+    }
+    const TensorShape &tensor_shape() const override
+    {
+        ARM_COMPUTE_ERROR_ON(_parent == nullptr);
+        return _tensor_shape;
+    }
+    DataType data_type() const override
+    {
+        ARM_COMPUTE_ERROR_ON(_parent == nullptr);
+        return _parent->data_type();
+    }
+    Format format() const override
+    {
+        ARM_COMPUTE_ERROR_ON(_parent == nullptr);
+        return _parent->format();
+    }
+    size_t total_size() const override
+    {
+        ARM_COMPUTE_ERROR_ON(_parent == nullptr);
+        return _parent->total_size();
+    }
+    PaddingSize padding() const override
+    {
+        ARM_COMPUTE_ERROR_ON(_parent == nullptr);
+        return _parent->padding();
+    }
+    bool has_padding() const override
+    {
+        ARM_COMPUTE_ERROR_ON(_parent == nullptr);
+        return _parent->has_padding();
+    }
+    bool is_resizable() const override
+    {
+        ARM_COMPUTE_ERROR_ON(_parent == nullptr);
+        return _parent->is_resizable();
+    }
+    void set_is_resizable(bool is_resizable) override
+    {
+        ARM_COMPUTE_ERROR_ON(_parent == nullptr);
+        _parent->set_is_resizable(is_resizable);
+    }
+    ValidRegion valid_region() const override
+    {
+        return _valid_region;
+    }
+    void set_valid_region(ValidRegion valid_region) override
+    {
+        ARM_COMPUTE_ERROR_ON(_parent == nullptr);
+        ARM_COMPUTE_ERROR_ON_INVALID_SUBTENSOR_VALID_REGION(_parent->valid_region(), valid_region);
+        _valid_region = std::move(valid_region);
+    }
+
+private:
+    ITensorInfo *_parent;
+    TensorShape  _tensor_shape;
+    Coordinates  _coords;
+    ValidRegion  _valid_region;
+};
+}
+#endif /*__ARM_COMPUTE_SUBTENSORINFO_H__ */
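SubTensorInfo keeps only its own shape, start coordinates and valid region and forwards everything else to the parent, so a sub-tensor is effectively a typed view into the parent's allocation. A minimal usage sketch:

TensorInfo    parent_info(TensorShape(64U, 64U, 16U), 1, DataType::F32);

// View of the first 4 feature maps; the X and Y sizes must match the parent's.
SubTensorInfo sub_info(&parent_info, TensorShape(64U, 64U, 4U), Coordinates(0, 0, 0));

// sub_info.tensor_shape() reports (64, 64, 4); strides, element size and data
// type are all taken from parent_info.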
diff --git a/arm_compute/core/TensorInfo.h b/arm_compute/core/TensorInfo.h
index b1f7db0..35b9ccb 100644
--- a/arm_compute/core/TensorInfo.h
+++ b/arm_compute/core/TensorInfo.h
@@ -24,6 +24,8 @@
 #ifndef __ARM_COMPUTE_TENSORINFO_H__
 #define __ARM_COMPUTE_TENSORINFO_H__
 
+#include "arm_compute/core/ITensorInfo.h"
+
 #include "arm_compute/core/Coordinates.h"
 #include "arm_compute/core/Strides.h"
 #include "arm_compute/core/TensorShape.h"
@@ -37,13 +39,15 @@
 class HOGInfo;
 
 /** Store the tensor's metadata */
-class TensorInfo
+class TensorInfo final : public ITensorInfo
 {
 public:
     /** Default constructor */
     TensorInfo();
     /** Default destructor */
-    virtual ~TensorInfo() = default;
+    ~TensorInfo() = default;
+    /** Construct a tensor info from the metadata of another tensor info */
+    TensorInfo(const ITensorInfo &info);
     /** Allow instances of this class to be copy constructed */
     TensorInfo(const TensorInfo &) = default;
     /** Allow instances of this class to be copied */
@@ -52,6 +56,15 @@
     TensorInfo(TensorInfo &&) = default;
     /** Allow instances of this class to be moved */
     TensorInfo &operator=(TensorInfo &&) = default;
+
+    /** Construct a tensor info with a format.
+     *
+     * Can be used for automatic derivation of the shape by the function.
+     *
+     * @param[in] format Format of the tensor.
+     */
+    TensorInfo(Format format);
+
     /** 2D tensor constructor
      *
      * @param[in] width  Width of the 2D tensor
@@ -65,15 +78,25 @@
      * @param[in] format       Single plane format of the tensor.
      */
     TensorInfo(const TensorShape &tensor_shape, Format format);
+
+    /** Construct a tensor info with a data type and number of channels.
+     *
+     * Can be used for automatic derivation of the shape by the function.
+     *
+     * @param[in] num_channels         It indicates the number of channels for each tensor element
+     * @param[in] data_type            Data type to use for each tensor element
+     * @param[in] fixed_point_position (Optional) It specifies the fixed point position when the tensor data type is QS8 or QS16.
+     */
+    TensorInfo(size_t num_channels, DataType data_type, size_t fixed_point_position = 0);
+
     /** Constructor
      *
-     * @param[in] tensor_shape    It specifies the size for each dimension of the tensor in number of elements.
-     * @param[in] num_channels    It indicates the number of channels for each tensor element
-     * @param[in] data_type       Data type to use for each tensor element
-     * @param[in] fixed_point_pos (Optional) It specifies the fixed point position when the tensor data type is INT8, INT16 or INT32. (Default = 0)
-                                  If 0, calculations are performed in integer math
+     * @param[in] tensor_shape         It specifies the size for each dimension of the tensor in number of elements.
+     * @param[in] num_channels         It indicates the number of channels for each tensor element
+     * @param[in] data_type            Data type to use for each tensor element
+     * @param[in] fixed_point_position (Optional) Fixed point position that expresses the number of bits for the fractional part of the number when the tensor's data type is QS8 or QS16.
      */
-    TensorInfo(const TensorShape &tensor_shape, size_t num_channels, DataType data_type, size_t fixed_point_pos = 0);
+    TensorInfo(const TensorShape &tensor_shape, size_t num_channels, DataType data_type, int fixed_point_position = 0);
     /** Constructor
      *
      * @param[in] hog_info HOG's metadata used to allocate normalized HOG space
@@ -81,6 +104,15 @@
      * @param[in] height   Height of the 2D tensor where the HOG descriptor will be computed on
      */
     TensorInfo(const HOGInfo &hog_info, unsigned int width, unsigned int height);
+
+    /** Initialize the tensor info with just a format.
+     *
+     * Can be used for automatic derivation of the shape by the function.
+     *
+     * @param[in] format Single plane format of the tensor.
+     */
+    void init(Format format);
+
     /** Initialize the metadata structure with the given parameters
      *
      * @param[in] tensor_shape Size for each dimension of the tensor in number of elements.
@@ -96,15 +128,25 @@
      * @param[in] total_size_in_bytes           Size in bytes of the memory allocation (including the offset to the first element).
      */
     void init(const TensorShape &tensor_shape, Format format, const Strides &strides_in_bytes, size_t offset_first_element_in_bytes, size_t total_size_in_bytes);
+
+    /** Initialize the tensor info with a data type and number of channels.
+     *
+     * Can be used for automatic derivation of the shape by the function.
+     *
+     * @param[in] num_channels         Desired number of channels for each tensor element.
+     * @param[in] data_type            Data type to use for each tensor element.
+     * @param[in] fixed_point_position (Optional) Fixed point position when the tensor data type is QS8 or QS16.
+     */
+    void init(size_t num_channels, DataType data_type, size_t fixed_point_position = 0);
+
     /** Initialize the metadata structure with the given parameters
      *
-     * @param[in] tensor_shape    Size for each dimension of the tensor in number of elements.
-     * @param[in] num_channels    Desired number of channels for each tensor element.
-     * @param[in] data_type       Data type to use for each tensor element.
-     * @param[in] fixed_point_pos (Optional) Fixed point position when the tensor data type is INT8, INT16 or INT32 (default = 0).
-     *                            If 0, calculations are performed in integer arithmetic.
+     * @param[in] tensor_shape         Size for each dimension of the tensor in number of elements.
+     * @param[in] num_channels         Desired number of channels for each tensor element.
+     * @param[in] data_type            Data type to use for each tensor element.
+     * @param[in] fixed_point_position (Optional) Fixed point position that expresses the number of bits for the fractional part of the number when the tensor's data type is QS8 or QS16.
      */
-    void init(const TensorShape &tensor_shape, size_t num_channels, DataType data_type, size_t fixed_point_pos = 0);
+    void init(const TensorShape &tensor_shape, size_t num_channels, DataType data_type, int fixed_point_position = 0);
     /** Initialize the metadata structure with the given parameters
      *
      * @param[in] tensor_shape                  Size for each dimension of the tensor in number of elements.
@@ -113,11 +155,10 @@
      * @param[in] strides_in_bytes              Stride in bytes for accessing each dimension of the tensor.
      * @param[in] offset_first_element_in_bytes Offset in bytes from the beginning of memory allocation to access the first element.
      * @param[in] total_size_in_bytes           Size in bytes of the memory allocation (including the offset to the first element).
-     * @param[in] fixed_point_pos               (Optional) Fixed point position when the tensor data type is INT8, INT16 or INT32 (default = 0).
-     *                                          If 0, calculations are performed in integer arithmetic.
+     * @param[in] fixed_point_position          (Optional) Fixed point position that expresses the number of bits for the fractional part of the number when the tensor's data type is QS8 or QS16.
      */
     void init(const TensorShape &tensor_shape, size_t num_channels, DataType data_type, const Strides &strides_in_bytes, size_t offset_first_element_in_bytes,
-              size_t total_size_in_bytes, size_t fixed_point_pos = 0);
+              size_t total_size_in_bytes, int fixed_point_position = 0);
     /** Initialize the metadata structure for the given HOG's metadata
      *
      * @param[in] hog_info HOG's metadata used to allocate normalized HOG space
@@ -140,15 +181,14 @@
      *
      * @note The padding used by this method is really conservative so that the tensor can be used for most functions.
      *
-     * @param[in] tensor_shape    It specifies the size for each dimension of the tensor in number of elements
-     * @param[in] num_channels    It indicates the number of channels for each tensor element
-     * @param[in] data_type       Data type to use for each tensor element
-     * @param[in] fixed_point_pos (Optional) It specifies the fixed point position when the tensor data type is INT8, INT16 or INT32. (Default = 0)
-     *                            If 0, calculations are performed in integer math
+     * @param[in] tensor_shape         It specifies the size for each dimension of the tensor in number of elements
+     * @param[in] num_channels         It indicates the number of channels for each tensor element
+     * @param[in] data_type            Data type to use for each tensor element
+     * @param[in] fixed_point_position (Optional) Fixed point position that expresses the number of bits for the fractional part of the number when the tensor's data type is QS8 or QS16.
      *
      * @return Total allocation size including padding in bytes.
      */
-    size_t init_auto_padding(const TensorShape &tensor_shape, size_t num_channels, DataType data_type, size_t fixed_point_pos = 0);
+    size_t init_auto_padding(const TensorShape &tensor_shape, size_t num_channels, DataType data_type, int fixed_point_position = 0);
     /** Initialize the metadata structure for the given HOG's metadata
      *
      * @note init_auto_padding will be used for the tensor initialization.
@@ -158,167 +198,81 @@
      * @param[in] height   Height of the 2D tensor where the HOG descriptor will be computed on
      */
     size_t init_auto_padding(const HOGInfo &hog_info, unsigned int width, unsigned int height);
-    /** Update the offset to the first element and the strides to automatically computed values.
-     *
-     * @note The padding used by this method is really conservative so that the tensor can be used for most functions.
-     *
-     * @return True if the strides or the offset to the first element have changed.
-     */
-    bool auto_padding();
-    /** Update the offset to the first element, the strides and the total size.
-     *
-     * @note This function can only increase the offset, strides and total size.
-     *
-     * @param[in] padding Padding around the XY plane in number of elements.
-     *
-     * @return True if the strides, offset and total size have changed.
-     */
-    bool extend_padding(const PaddingSize &padding);
-    /** Set the format of an already initialized tensor.
-     *
-     * @note The passed format must be compatible with the existing number of channels and data type of the tensor.
-     *
-     * @param[in] format Single-plane format of the tensor.
-     */
-    void set_format(Format format);
-    /** Return the size of the requested dimension
-     *
-     * @param[in] index Index of the dimension
-     *
-     * @return Dimension of the requested dimension
-     */
-    size_t dimension(size_t index) const
+
+    // Inherited methods overridden:
+    void set_data_type(DataType data_type) override;
+    void set_num_channels(int num_channels) override;
+    void set_format(Format format) override;
+    void set_tensor_shape(TensorShape shape) override;
+    void set_fixed_point_position(int fixed_point_position) override;
+    bool auto_padding() override;
+    bool extend_padding(const PaddingSize &padding) override;
+    size_t dimension(size_t index) const override
     {
         return _tensor_shape[index];
     }
-    /** The strides in bytes for accessing each dimension of the tensor
-     *
-     * @return Strides in bytes for each tensor dimension
-     */
-    const Strides &strides_in_bytes() const
+    const Strides &strides_in_bytes() const override
     {
         return _strides_in_bytes;
     }
-    /** The offset from the beginning of the memory allocation to the first element of the tensor.
-     *  This can be used to access efficiently elements in a 2D tensor
-     *
-     * @return The offset in bytes to access the first element of the tensor.
-     */
-    size_t offset_first_element_in_bytes() const
+    size_t offset_first_element_in_bytes() const override
     {
         return _offset_first_element_in_bytes;
     }
-    /** The offset in bytes from the beginning of the memory allocation to access the element at position (x, y, z ...)
-     *
-     * @param[in] pos Vector with the coordinates of the element to access.
-     *                The size of this vector must be equal to the number of dimensions of the tensor
-     *
-     * @return Offset in bytes from the beginning of the memory allocation to access the element (x, y, z, ...)
-     */
-    size_t offset_element_in_bytes(const Coordinates &pos) const;
-    /** Fixed point position used when the tensor data type is S8, S16 or S32.
-     *
-     * @return The fixed point position
-     */
-    size_t fixed_point_pos() const
+    size_t offset_element_in_bytes(const Coordinates &pos) const override;
+    int fixed_point_position() const override
     {
-        return _fixed_point_pos;
+        return _fixed_point_position;
     }
-    /** Element size in bytes calculated as data_size() * num_channels
-     *
-     * @return The size of one element in bytes
-     */
-    size_t element_size() const
+    size_t element_size() const override
     {
         return data_size_from_type(_data_type) * _num_channels;
     }
-    /** The number of dimensions of the tensor (rank)
-     *
-     * @return The number of dimensions of the tensor (rank)
-     */
-    size_t num_dimensions() const
+    size_t num_dimensions() const override
     {
         return _tensor_shape.num_dimensions();
     }
-    /** The number of channels for each tensor element
-     *
-     * @return The number of channels for each tensor element
-     */
-    size_t num_channels() const
+    size_t num_channels() const override
     {
         return _num_channels;
     }
-    /** Size for each dimension of the tensor
-     *
-     * @return A vector with the size for each dimension of the tensor
-     */
-    const TensorShape &tensor_shape() const
+    const TensorShape &tensor_shape() const override
     {
         return _tensor_shape;
     }
-    /** Data type used for each element of the tensor
-     *
-     * @return Tensor data type
-     */
-    DataType data_type() const
+    DataType data_type() const override
     {
         return _data_type;
     }
-    /** Colour format of the image
-     *
-     * @return Colour format of the image
-     */
-    Format format() const
+    Format format() const override
     {
         return _format;
     }
-    /** Returns the total size of the tensor in bytes.
-     *
-     * @return Total size of the tensor in bytes.
-     */
-    size_t total_size() const
+    size_t total_size() const override
     {
         return _total_size;
     }
-    /** Padding of tensor.
-     *
-     * @return Padding.
-     */
-    PaddingSize padding() const
+    PaddingSize padding() const override
     {
         return _padding;
     }
-    /** Checks if the tensor has been allocated with padding or not.
-     *
-     * @return True if padding is allocated in the tensor, otherwise false.
-     */
-    bool has_padding() const
+    bool has_padding() const override
     {
         return !_padding.empty();
     }
-    /** Flag indicating whether the size of the tensor can be changed.
-     *
-     * @return True if the tensor size can be changed.
-     */
-    bool is_resizable() const
+    bool is_resizable() const override
     {
         return _is_resizable;
     }
-    /** Set the flag whether the tensor size can be changed. */
-    void set_is_resizable(bool is_resizable)
+    void set_is_resizable(bool is_resizable) override
     {
         _is_resizable = is_resizable;
     }
-    /** Valid region of the tensor. All elements in the valid region have defined values, i.e. are not undefined.
-     *
-     * @return The valid region.
-     */
-    ValidRegion valid_region() const
+    ValidRegion valid_region() const override
     {
         return _valid_region;
     }
-    /** Set the valid region of the tensor. */
-    void set_valid_region(ValidRegion valid_region)
+    void set_valid_region(ValidRegion valid_region) override
     {
         _valid_region = std::move(valid_region);
     }
@@ -331,7 +285,7 @@
     std::tuple<Strides, size_t, size_t> calculate_padding_requirements(const PaddingSize &padding);
 
     size_t      _total_size;
-    size_t      _fixed_point_pos;
+    int         _fixed_point_position;
     size_t      _offset_first_element_in_bytes;
     Strides     _strides_in_bytes;
     size_t      _num_channels;
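With fixed_point_position now part of ITensorInfo, a QS8 tensor describes its Q format entirely through its metadata. For example (usage sketch):

// 32x32 single-channel QS8 tensor in Q5.2 format (2 fractional bits).
TensorInfo info(TensorShape(32U, 32U), 1, DataType::QS8, 2);

// info.element_size() == 1 and info.fixed_point_position() == 2; kernels use
// the latter to rescale results after multiplications and accumulations.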
diff --git a/arm_compute/core/TensorShape.h b/arm_compute/core/TensorShape.h
index 3ac6298..f8b3181 100644
--- a/arm_compute/core/TensorShape.h
+++ b/arm_compute/core/TensorShape.h
@@ -38,7 +38,6 @@
 class TensorShape : public Dimensions<size_t>
 {
 public:
-#ifndef DOXYGEN_SKIP_THIS /* Doxygen gets confused by the templates and can't match the implementation to the declaration */
     /** Constructor to initialize the tensor shape.
      *
      * @param[in] dims Values to initialize the dimensions.
@@ -47,10 +46,15 @@
     TensorShape(Ts... dims)
         : Dimensions{ dims... }
     {
-        // Initialize empty dimensions to 1
-        std::fill(_id.begin() + _num_dimensions, _id.end(), 1);
+        // Initialize unspecified dimensions to 1
+        if(_num_dimensions > 0)
+        {
+            std::fill(_id.begin() + _num_dimensions, _id.end(), 1);
+        }
+
+        // Correct the number of dimensions to ignore trailing dimensions of size 1
+        apply_dimension_correction();
     }
-#endif
     /** Allow instances of this class to be copy constructed */
     TensorShape(const TensorShape &) = default;
     /** Allow instances of this class to be copied */
@@ -61,15 +65,47 @@
     TensorShape &operator=(TensorShape &&) = default;
     /** Default destructor */
     ~TensorShape() = default;
+
+    /** Accessor to set the value of one of the dimensions.
+     *
+     * @param[in] dimension Dimension for which the value is set.
+     * @param[in] value     Value to be set for the dimension.
+     */
+    void set(size_t dimension, size_t value)
+    {
+        ARM_COMPUTE_ERROR_ON(value < 1);
+
+        // Make sure all empty dimensions are filled with 1
+        std::fill(_id.begin() + _num_dimensions, _id.end(), 1);
+
+        // Set the specified dimension and increase the number of dimensions if
+        // necessary
+        Dimensions::set(dimension, value);
+
+        // Correct the number of dimensions to ignore trailing dimensions of size 1
+        apply_dimension_correction();
+    }
+
+    /** Collapse @p n dimensions, starting from @p first, into a single dimension.
+     *
+     * @param[in] n     Number of dimensions to collapse into @p first.
+     * @param[in] first (Optional) Dimension into which the following @p n are collapsed.
+     */
+    void collapse(size_t n, size_t first = 0)
+    {
+        Dimensions::collapse(n, first);
+
+        // Make sure all empty dimensions are filled with 1
+        std::fill(_id.begin() + _num_dimensions, _id.end(), 1);
+    }
+
     /** Collapses all dimensions to a single linear total size.
      *
      * @return The total tensor size in terms of elements.
      */
     size_t total_size() const
     {
-        const size_t size = std::accumulate(_id.begin(), _id.end(), 1, std::multiplies<size_t>());
-        ARM_COMPUTE_ERROR_ON(0 == size);
-        return size;
+        return std::accumulate(_id.begin(), _id.end(), 1, std::multiplies<size_t>());
     }
     /** Collapses given dimension and above.
      *
@@ -81,9 +117,24 @@
      */
     size_t total_size_upper(size_t dimension) const
     {
-        const size_t size = std::accumulate(_id.begin() + dimension, _id.end(), 1, std::multiplies<size_t>());
-        ARM_COMPUTE_ERROR_ON(0 == size);
-        return size;
+        return std::accumulate(_id.begin() + dimension, _id.end(), 1, std::multiplies<size_t>());
+    }
+
+private:
+    /** Remove trailing dimensions of size 1 from the reported number of dimensions. */
+    void apply_dimension_correction()
+    {
+        for(int i = static_cast<int>(_num_dimensions) - 1; i >= 0; --i)
+        {
+            if(_id[i] == 1)
+            {
+                --_num_dimensions;
+            }
+            else
+            {
+                break;
+            }
+        }
     }
 };
 }
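The dimension correction means trailing dimensions of size 1 no longer count towards num_dimensions(), while total_size() still multiplies every slot (the unused ones being padded with 1). Following the code above:

TensorShape shape(13U, 9U, 1U);   // trailing dimension of size 1 is not counted
// shape.num_dimensions() == 2, shape.total_size() == 117

shape.set(3, 4U);                 // unspecified dimensions in between are filled with 1
// shape is now (13, 9, 1, 4): num_dimensions() == 4, total_size() == 468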
diff --git a/arm_compute/core/Types.h b/arm_compute/core/Types.h
index 6188d58..725567b 100644
--- a/arm_compute/core/Types.h
+++ b/arm_compute/core/Types.h
@@ -62,8 +62,10 @@
     UNKNOWN,
     U8,
     S8,
+    QS8,
     U16,
     S16,
+    QS16,
     U32,
     S32,
     U64,
@@ -182,6 +184,14 @@
         return size;
     }
 
+    void limit(const BorderSize &limit)
+    {
+        top    = std::min(top, limit.top);
+        right  = std::min(right, limit.right);
+        bottom = std::min(bottom, limit.bottom);
+        left   = std::min(left, limit.left);
+    }
+
     unsigned int top;
     unsigned int right;
     unsigned int bottom;
@@ -223,7 +233,8 @@
 enum class RoundingPolicy
 {
     TO_ZERO,        /**< Truncates the least significand values that are lost in operations. */
-    TO_NEAREST_EVEN /**< Rounds to nearest even output value */
+    TO_NEAREST_UP,  /**< Rounds to nearest value; half rounds up */
+    TO_NEAREST_EVEN /**< Rounds to nearest value; half rounds to nearest even */
 };
 
 /** Termination criteria */
@@ -326,17 +337,17 @@
 /** The normalization type used for the normalization layer */
 enum class NormType
 {
-    IN_MAP,   /* Normalization applied within the same map */
-    CROSS_MAP /* Normalization applied cross maps */
+    IN_MAP_1D, /**< Normalization applied within the same map in 1D region */
+    IN_MAP_2D, /**< Normalization applied within the same map in 2D region */
+    CROSS_MAP  /**< Normalization applied cross maps */
 };
 
 /** Normalization type for Histogram of Oriented Gradients (HOG) */
 enum class HOGNormType
 {
-    L2_NORM,    /**< L2-norm */
-    L2HYS_NORM, /**< L2-norm followed by clipping */
-    L1_NORM,    /**< L1 norm */
-    L1SQRT_NORM /**< L1 norm with SQRT */
+    L2_NORM    = 1, /**< L2-norm */
+    L2HYS_NORM = 2, /**< L2-norm followed by clipping */
+    L1_NORM    = 3  /**< L1 norm */
 };
 
 /** Detection window used for the object detection. The detection window keeps the following information:
@@ -497,7 +508,7 @@
 public:
     /** Default Constructor
      *
-     * @param[in] type      The normalization type. Can be @ref NormType::IN_MAP or NORM_TYPE::CROSS_MAP
+     * @param[in] type      The normalization type. Can be @ref NormType::IN_MAP_1D, @ref NormType::IN_MAP_2D or @ref NormType::CROSS_MAP
      * @param[in] norm_size The normalization size is the number of elements to normalize across. Defaults to 5.
      * @param[in] alpha     Alpha parameter used by normalization equation. Defaults to 0.0001.
      * @param[in] beta      Beta parameter used by normalization equation. Defaults to 0.5.
@@ -527,12 +538,17 @@
     {
         return _kappa;
     }
-    /** Return the scaling factor of the normalization function. If kappa is not 1 then [Krichevksy 2012] normalization scaling is specified.
+    /** Return the scaling factor of the normalization function. If kappa is not
+     * 1 then [Krizhevsky 2012] normalization scaling is specified. The scaling
+     * factor takes into account the total number of elements used for the
+     * normalization, so in the 2D case this is _norm_size^2.
+     *
      * @return The normalization scaling factor.
      */
     float scale_coeff() const
     {
-        return (_kappa == 1.f) ? (_alpha / _norm_size) : _alpha;
+        const uint32_t size = (_type == NormType::IN_MAP_2D) ? _norm_size * _norm_size : _norm_size;
+        return (_kappa == 1.f) ? (_alpha / size) : _alpha;
     }
 
 private:
@@ -543,6 +559,38 @@
     float    _kappa;
 };
 
+/** Convolution Layer Weights Information class */
+class WeightsInfo
+{
+public:
+    WeightsInfo()
+        : _are_reshaped(false), _kernel_size(0)
+    {
+    }
+    /** Constructor
+     *
+     * @param[in] are_reshaped True if the weights have been reshaped
+     * @param[in] kernel_size  The size of the kernel.
+     */
+    WeightsInfo(bool are_reshaped, unsigned int kernel_size)
+        : _are_reshaped(are_reshaped), _kernel_size(kernel_size)
+    {
+    }
+
+    bool are_reshaped() const
+    {
+        return _are_reshaped;
+    };
+    unsigned int kernel_size() const
+    {
+        return _kernel_size;
+    }
+
+private:
+    const bool         _are_reshaped;
+    const unsigned int _kernel_size;
+};
+
 /** IO formatting information class*/
 struct IOFormatInfo
 {
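For the new IN_MAP_2D type the normalizing window holds norm_size^2 elements, which the updated scale_coeff() above accounts for. A quick numeric check (with the default kappa of 1):

NormalizationLayerInfo info_1d(NormType::IN_MAP_1D, 5, 0.0001f, 0.5f);
NormalizationLayerInfo info_2d(NormType::IN_MAP_2D, 5, 0.0001f, 0.5f);

// info_1d.scale_coeff() == 0.0001f / 5.f   (5 elements in the 1D window)
// info_2d.scale_coeff() == 0.0001f / 25.f  (5x5 elements in the 2D window)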
diff --git a/arm_compute/core/Utils.h b/arm_compute/core/Utils.h
index 3ebf3ff..9d3ff0a 100644
--- a/arm_compute/core/Utils.h
+++ b/arm_compute/core/Utils.h
@@ -33,6 +33,7 @@
 #include <numeric>
 #include <sstream>
 #include <string>
+#include <type_traits>
 #include <utility>
 
 namespace arm_compute
@@ -101,10 +102,12 @@
     {
         case DataType::U8:
         case DataType::S8:
+        case DataType::QS8:
             return 1;
         case DataType::U16:
         case DataType::S16:
         case DataType::F16:
+        case DataType::QS16:
             return 2;
         case DataType::F32:
         case DataType::U32:
@@ -170,10 +173,13 @@
 {
     switch(dt)
     {
+        case DataType::S8:
         case DataType::U8:
+        case DataType::QS8:
             return 1;
         case DataType::U16:
         case DataType::S16:
+        case DataType::QS16:
         case DataType::F16:
             return 2;
         case DataType::U32:
@@ -536,14 +542,14 @@
 
 /** Returns expected width and height of output scaled tensor depending on dimensions rounding mode.
  *
- * @param width         Width of input tensor (Number of columns)
- * @param height        Height of input tensor (Number of rows)
- * @param kernel_size   Kernel size.
- * @param stride_x      Stride of the operation in the x dimension.
- * @param stride_y      Stride of the operation in the y dimension.
- * @param pad_x         Padding size in the x dimension.
- * @param pad_y         Padding size in the y dimension.
- * @param round_type    Dimensions rounding mode.
+ * @param[in] width       Width of input tensor (Number of columns)
+ * @param[in] height      Height of input tensor (Number of rows)
+ * @param[in] kernel_size Kernel size.
+ * @param[in] stride_x    Stride of the operation in the x dimension.
+ * @param[in] stride_y    Stride of the operation in the y dimension.
+ * @param[in] pad_x       Padding size in the x dimension.
+ * @param[in] pad_y       Padding size in the y dimension.
+ * @param[in] round_type  Dimensions rounding mode.
  *
  * @return A pair with the new width in the first position and the new height in the second.
  */
@@ -610,14 +616,27 @@
  * @return The string describing the border mode.
  */
 const std::string &string_from_border_mode(BorderMode border_mode);
+/** Translates a given normalization type to a string.
+ *
+ * @param[in] type @ref NormType to be translated to string.
+ *
+ * @return The string describing the normalization type.
+ */
+const std::string &string_from_norm_type(NormType type);
 /** Lower a given string.
  *
- * @param val Given string to lower.
+ * @param[in] val Given string to lower.
  *
  * @return The lowered string
  */
-std::string lower_string(std::string val);
+std::string lower_string(const std::string &val);
 
+/** Check if a given data type is of floating point type
+ *
+ * @param[in] dt Input data type.
+ *
+ * @return True if data type is of floating point type, else false.
+ */
 inline bool is_data_type_float(DataType dt)
 {
     switch(dt)
@@ -630,6 +649,24 @@
     }
 }
 
+/** Check if a given data type is of fixed point type
+ *
+ * @param[in] dt Input data type.
+ *
+ * @return True if data type is of fixed point type, else false.
+ */
+inline bool is_data_type_fixed_point(DataType dt)
+{
+    switch(dt)
+    {
+        case DataType::QS8:
+        case DataType::QS16:
+            return true;
+        default:
+            return false;
+    }
+}
+
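The helper complements is_data_type_float when kernels need to branch on the arithmetic family. A hypothetical validation snippet:

// Hypothetical check inside a kernel's configure():
if(is_data_type_fixed_point(input->info()->data_type()))
{
    // QS8/QS16 operands must agree on their fixed point position.
    ARM_COMPUTE_ERROR_ON(input->info()->fixed_point_position() != output->info()->fixed_point_position());
}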
 /** Print consecutive elements to an output stream.
  *
  * @param[out] s             Output stream to print the elements to.
@@ -641,6 +678,8 @@
 template <typename T>
 void print_consecutive_elements_impl(std::ostream &s, const T *ptr, unsigned int n, int stream_width = 0, const std::string &element_delim = " ")
 {
+    using print_type = typename std::conditional<std::is_floating_point<T>::value, T, int>::type;
+
     for(unsigned int i = 0; i < n; ++i)
     {
         // Set stream width as it is not a "sticky" stream manipulator
@@ -648,28 +687,29 @@
         {
             s.width(stream_width);
         }
-        s << std::right << ptr[i] << element_delim;
+        s << std::right << static_cast<print_type>(ptr[i]) << element_delim;
     }
 }
 
 /** Identify the maximum width of n consecutive elements.
  *
- * @param[in] s The output stream which will be used to print the elements. Used to extract the stream format.
- *
- * @param ptr    Pointer to the elements.
- * @param n      Number of elements.
+ * @param[in] s   The output stream which will be used to print the elements. Used to extract the stream format.
+ * @param[in] ptr Pointer to the elements.
+ * @param[in] n   Number of elements.
  *
  * @return The maximum width of the elements.
  */
 template <typename T>
 int max_consecutive_elements_display_width_impl(std::ostream &s, const T *ptr, unsigned int n)
 {
+    using print_type = typename std::conditional<std::is_floating_point<T>::value, T, int>::type;
+
     int max_width = -1;
     for(unsigned int i = 0; i < n; ++i)
     {
         std::stringstream ss;
         ss.copyfmt(s);
-        ss << ptr[i];
+        ss << static_cast<print_type>(ptr[i]);
         max_width = std::max<int>(max_width, ss.str().size());
     }
     return max_width;
diff --git a/arm_compute/core/Validate.h b/arm_compute/core/Validate.h
index 5f1c541..48eba70 100644
--- a/arm_compute/core/Validate.h
+++ b/arm_compute/core/Validate.h
@@ -37,6 +37,82 @@
 
 namespace arm_compute
 {
+namespace detail
+{
+/* Check whether two dimension objects differ.
+ *
+ * @param[in] dim1      First object to be compared.
+ * @param[in] dim2      Second object to be compared.
+ * @param[in] upper_dim The dimension from which to check.
+ *
+ * @return Return true if the two objects are different.
+ */
+template <typename T>
+inline bool have_different_dimensions(const Dimensions<T> &dim1, const Dimensions<T> &dim2, unsigned int upper_dim)
+{
+    for(unsigned int i = upper_dim; i < arm_compute::Dimensions<T>::num_max_dimensions; ++i)
+    {
+        if(dim1[i] != dim2[i])
+        {
+            return true;
+        }
+    }
+
+    return false;
+}
+
+/** Functor to compare two @ref Dimensions objects and throw an error on mismatch.
+ *
+ * @param[in] dim      Object to compare against.
+ * @param[in] function Function in which the error occurred.
+ * @param[in] file     File in which the error occurred.
+ * @param[in] line     Line on which the error occurred.
+ */
+template <typename T>
+class compare_dimension
+{
+public:
+    compare_dimension(const Dimensions<T> &dim, const char *function, const char *file, int line)
+        : _dim{ dim }, _function{ function }, _file{ file }, _line{ line }
+    {
+    }
+
+    /** Compare the given object against the stored one.
+     *
+     * @param[in] dim Object to be compared against the stored one.
+     */
+    void operator()(const Dimensions<T> &dim)
+    {
+        ARM_COMPUTE_ERROR_ON_LOC_MSG(have_different_dimensions(_dim, dim, 0), _function, _file, _line,
+                                     "Objects have different dimensions");
+    }
+
+private:
+    const Dimensions<T> &_dim;
+    const char *const    _function;
+    const char *const    _file;
+    const int            _line;
+};
+} // namespace detail
+/** Throw an error if one of the pointers is a nullptr.
+ *
+ *  @param[in] function Function in which the error occurred.
+ *  @param[in] file     Name of the file where the error occurred.
+ *  @param[in] line     Line on which the error occurred.
+ *  @param[in] pointers Pointers to check against nullptr.
+ */
+template <typename... Ts>
+void error_on_nullptr(const char *function, const char *file, const int line, Ts &&... pointers)
+{
+    auto is_nullptr = [&](const void *ptr)
+    {
+        ARM_COMPUTE_ERROR_ON_LOC(ptr == nullptr, function, file, line);
+    };
+
+    for_each(is_nullptr, std::forward<Ts>(pointers)...);
+}
+#define ARM_COMPUTE_ERROR_ON_NULLPTR(...) ::arm_compute::error_on_nullptr(__func__, __FILE__, __LINE__, __VA_ARGS__)
+
 /** Throw an error if the passed window is invalid.
  *
  * The subwindow is invalid if:
@@ -99,27 +175,28 @@
                                     const Window &win, unsigned int max_dim);
 #define ARM_COMPUTE_ERROR_ON_WINDOW_DIMENSIONS_GTE(w, md) ::arm_compute::error_on_window_dimensions_gte(__func__, __FILE__, __LINE__, w, md)
 
-/* Check whether two tensors have different shapes.
+/** Throw an error if the passed dimension objects differ.
  *
- * @param[in] tensor_1 First tensor to be compared
- * @param[in] tensor_2 Second tensor to be compared
- *
- * @return Return true if the two tensors have different shapes
+ *  @param[in] function Function in which the error occurred.
+ *  @param[in] file     Name of the file where the error occurred.
+ *  @param[in] line     Line on which the error occurred.
+ *  @param[in] dim1     The first object to be compared.
+ *  @param[in] dim2     The second object to be compared.
+ *  @param[in] dims     (Optional) Further allowed objects.
  */
-inline bool have_different_shapes(const ITensor *tensor_1, const ITensor *tensor_2)
+template <typename T, typename... Ts>
+void error_on_mismatching_dimensions(const char *function, const char *file, int line,
+                                     const Dimensions<T> &dim1, const Dimensions<T> &dim2, Ts &&... dims)
 {
-    for(size_t i = 0; i < arm_compute::Coordinates::num_max_dimensions; ++i)
-    {
-        if(tensor_1->info()->dimension(i) != tensor_2->info()->dimension(i))
-        {
-            return true;
-        }
-    }
+    ARM_COMPUTE_UNUSED(function);
+    ARM_COMPUTE_UNUSED(file);
+    ARM_COMPUTE_UNUSED(line);
 
-    return false;
+    for_each(detail::compare_dimension<T>(dim1, function, file, line), dim2, std::forward<Ts>(dims)...);
 }
+#define ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(...) ::arm_compute::error_on_mismatching_dimensions(__func__, __FILE__, __LINE__, __VA_ARGS__)
 
-/** Throw an error if the passed two tensors have different shapes
+/** Throw an error if the passed two tensors have different shapes from the given dimension
  *
  *  @param[in] function Function in which the error occurred.
  *  @param[in] file     Name of the file where the error occurred.
@@ -132,18 +209,36 @@
 void error_on_mismatching_shapes(const char *function, const char *file, const int line,
                                  const ITensor *tensor_1, const ITensor *tensor_2, Ts... tensors)
 {
+    error_on_mismatching_shapes(function, file, line, 0U, tensor_1, tensor_2, std::forward<Ts>(tensors)...);
+}
+
+/** Throw an error if the passed two tensors have different shapes from the given dimension
+ *
+ *  @param[in] function  Function in which the error occurred.
+ *  @param[in] file      Name of the file where the error occurred.
+ *  @param[in] line      Line on which the error occurred.
+ *  @param[in] upper_dim The dimension from which to check.
+ *  @param[in] tensor_1  The first tensor to be compared.
+ *  @param[in] tensor_2  The second tensor to be compared.
+ *  @param[in] tensors   (Optional) Further allowed tensors.
+ */
+template <typename... Ts>
+void error_on_mismatching_shapes(const char *function, const char *file, const int line,
+                                 unsigned int upper_dim, const ITensor *tensor_1, const ITensor *tensor_2, Ts... tensors)
+{
     ARM_COMPUTE_UNUSED(function);
     ARM_COMPUTE_UNUSED(file);
     ARM_COMPUTE_UNUSED(line);
-    ARM_COMPUTE_UNUSED(tensor_1);
-    ARM_COMPUTE_UNUSED(tensor_2);
 
-    const std::array<const ITensor *, sizeof...(Ts)> tensors_array{ { std::forward<Ts>(tensors)... } };
+    const std::array < const ITensor *, 2 + sizeof...(Ts) > tensors_array{ { tensor_1, tensor_2, std::forward<Ts>(tensors)... } };
     ARM_COMPUTE_UNUSED(tensors_array);
 
-    ARM_COMPUTE_ERROR_ON_LOC_MSG(have_different_shapes(tensor_1, tensor_2) || std::any_of(tensors_array.begin(), tensors_array.end(), [&](const ITensor * tensor)
+    ARM_COMPUTE_ERROR_ON_LOC(*tensors_array.cbegin() == nullptr, function, file, line);
+
+    ARM_COMPUTE_ERROR_ON_LOC_MSG(std::any_of(std::next(tensors_array.cbegin()), tensors_array.cend(), [&](const ITensor * tensor)
     {
-        return have_different_shapes(tensor_1, tensor);
+        ARM_COMPUTE_ERROR_ON_LOC(tensor == nullptr, function, file, line);
+        return detail::have_different_dimensions((*tensors_array.cbegin())->info()->tensor_shape(), tensor->info()->tensor_shape(), upper_dim);
     }),
     function, file, line, "Tensors have different shapes");
 }
@@ -183,6 +278,55 @@
 
 #define ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(...) ::arm_compute::error_on_mismatching_data_types(__func__, __FILE__, __LINE__, __VA_ARGS__)
 
+/** Throw an error if the passed tensors have different fixed point data types or different fixed point positions
+ *
+ * @note If the first tensor does not have a fixed point data type, the function returns without throwing an error.
+ *
+ *  @param[in] function Function in which the error occurred.
+ *  @param[in] file     Name of the file where the error occurred.
+ *  @param[in] line     Line on which the error occurred.
+ *  @param[in] tensor_1 The first tensor to be compared.
+ *  @param[in] tensor_2 The second tensor to be compared.
+ *  @param[in] tensors  (Optional) Further allowed tensors.
+ */
+template <typename... Ts>
+void error_on_mismatching_fixed_point(const char *function, const char *file, const int line,
+                                      const ITensor *tensor_1, const ITensor *tensor_2, Ts... tensors)
+{
+    ARM_COMPUTE_UNUSED(function);
+    ARM_COMPUTE_UNUSED(file);
+    ARM_COMPUTE_UNUSED(line);
+    ARM_COMPUTE_UNUSED(tensor_1);
+    ARM_COMPUTE_UNUSED(tensor_2);
+
+    DataType &&first_data_type            = tensor_1->info()->data_type();
+    const int  first_fixed_point_position = tensor_1->info()->fixed_point_position();
+    ARM_COMPUTE_UNUSED(first_data_type);
+    ARM_COMPUTE_UNUSED(first_fixed_point_position);
+
+    if((first_data_type != DataType::QS8) && (first_data_type != DataType::QS16))
+    {
+        return;
+    }
+
+    const std::array < const ITensor *, 1 + sizeof...(Ts) > tensors_array{ { tensor_2, std::forward<Ts>(tensors)... } };
+    ARM_COMPUTE_UNUSED(tensors_array);
+
+    ARM_COMPUTE_ERROR_ON_LOC_MSG(std::any_of(tensors_array.begin(), tensors_array.end(), [&](const ITensor * tensor)
+    {
+        return tensor->info()->data_type() != first_data_type;
+    }),
+    function, file, line, "Tensors have different fixed point data types");
+
+    ARM_COMPUTE_ERROR_ON_LOC_MSG(std::any_of(tensors_array.begin(), tensors_array.end(), [&](const ITensor * tensor)
+    {
+        return tensor->info()->fixed_point_position() != first_fixed_point_position;
+    }),
+    function, file, line, "Tensors have different fixed point positions");
+}
+
+#define ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(...) ::arm_compute::error_on_mismatching_fixed_point(__func__, __FILE__, __LINE__, __VA_ARGS__)
+
 /** Throw an error if the format of the passed tensor/multi-image does not match any of the formats provided.
  *
  *  @param[in] function Function in which the error occurred.
@@ -229,7 +373,7 @@
 {
     ARM_COMPUTE_ERROR_ON_LOC(tensor == nullptr, function, file, line);
 
-    DataType &&tensor_dt = tensor->info()->data_type();
+    const DataType &tensor_dt = tensor->info()->data_type(); //NOLINT
     ARM_COMPUTE_UNUSED(tensor_dt);
 
     ARM_COMPUTE_ERROR_ON_LOC(tensor_dt == DataType::UNKNOWN, function, file, line);
@@ -343,5 +487,77 @@
 void error_on_unconfigured_kernel(const char *function, const char *file, const int line,
                                   const IKernel *kernel);
 #define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k) ::arm_compute::error_on_unconfigured_kernel(__func__, __FILE__, __LINE__, k)
+
+/** Throw an error if the coordinates and shape of the subtensor are not within the parent tensor.
+ *
+ * @param[in] function     Function in which the error occurred.
+ * @param[in] file         Name of the file where the error occurred.
+ * @param[in] line         Line on which the error occurred.
+ * @param[in] parent_shape Parent tensor shape
+ * @param[in] coords       Coordinates inside the parent tensor where the first element of the subtensor is
+ * @param[in] shape        Shape of the subtensor
+ */
+void error_on_invalid_subtensor(const char *function, const char *file, const int line,
+                                const TensorShape &parent_shape, const Coordinates &coords, const TensorShape &shape);
+#define ARM_COMPUTE_ERROR_ON_INVALID_SUBTENSOR(p, c, s) ::arm_compute::error_on_invalid_subtensor(__func__, __FILE__, __LINE__, p, c, s)
+
+/** Throw an error if the valid region of a subtensor is not inside the valid region of the parent tensor.
+ *
+ * @param[in] function            Function in which the error occurred.
+ * @param[in] file                Name of the file where the error occurred.
+ * @param[in] line                Line on which the error occurred.
+ * @param[in] parent_valid_region Parent valid region.
+ * @param[in] valid_region        Valid region of subtensor.
+ */
+void error_on_invalid_subtensor_valid_region(const char *function, const char *file, const int line,
+                                             const ValidRegion &parent_valid_region, const ValidRegion &valid_region);
+#define ARM_COMPUTE_ERROR_ON_INVALID_SUBTENSOR_VALID_REGION(pv, sv) ::arm_compute::error_on_invalid_subtensor_valid_region(__func__, __FILE__, __LINE__, pv, sv)
+
+/** Throw an error if the input fixed-point positions are different.
+ *
+ *  @param[in] function Function in which the error occurred.
+ *  @param[in] file     Name of the file where the error occurred.
+ *  @param[in] line     Line on which the error occurred.
+ *  @param[in] tensor_1 The first tensor to be compared.
+ *  @param[in] tensor_2 The second tensor to be compared.
+ *  @param[in] tensors  (Optional) Further allowed tensors.
+ */
+template <typename... Ts>
+void error_on_mismatching_fixed_point_position(const char *function, const char *file, const int line,
+                                               const ITensor *tensor_1, const ITensor *tensor_2, Ts... tensors)
+{
+    const std::array < const ITensor *, 1 + sizeof...(Ts) > tensors_array{ { tensor_2, std::forward<Ts>(tensors)... } };
+    ARM_COMPUTE_UNUSED(tensors_array);
+
+    ARM_COMPUTE_ERROR_ON_LOC_MSG(std::any_of(tensors_array.begin(), tensors_array.end(), [&](const ITensor * tensor)
+    {
+        return tensor->info()->fixed_point_position() != tensor_1->info()->fixed_point_position();
+    }),
+    function, file, line, "Tensors have different fixed-point positions");
+}
+#define ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(...) ::arm_compute::error_on_mismatching_fixed_point_position(__func__, __FILE__, __LINE__, __VA_ARGS__)
+
+/** Throw an error if the fixed-point value is not representable in the specified Q format.
+ *
+ *  @param[in] function Function in which the error occurred.
+ *  @param[in] file     Name of the file where the error occurred.
+ *  @param[in] line     Line on which the error occurred.
+ *  @param[in] value    The floating point value to be checked.
+ *  @param[in] tensor   Input tensor that has information on data type and fixed-point position.
+ */
+template <typename... Ts>
+void error_on_value_not_representable_in_fixed_point(const char *function, const char *file, int line,
+                                                     float value, const ITensor *tensor)
+{
+    const int          fixed_point_position = tensor->info()->fixed_point_position();
+    const DataType     dt                   = tensor->info()->data_type();
+    const unsigned int q_max_range          = 0xFFFFFFFFu >> (((sizeof(unsigned int) - element_size_from_data_type(dt)) * 8) + 1);
+    const float        max_range            = q_max_range / (static_cast<float>(1 << fixed_point_position));
+    ARM_COMPUTE_UNUSED(max_range);
+
+    ARM_COMPUTE_ERROR_ON_LOC_MSG(value > max_range, function, file, line,
+                                 "Value %f is not representable in %s with fixed-point position %d", value, string_from_data_type(dt).c_str(), fixed_point_position);
+}
+#define ARM_COMPUTE_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(...) ::arm_compute::error_on_value_not_representable_in_fixed_point(__func__, __FILE__, __LINE__, __VA_ARGS__)
 }
 #endif /* __ARM_COMPUTE_VALIDATE_H__*/
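
The new validation helpers above are meant to be used together at the top of a kernel's or function's configure() step. A rough usage sketch, assuming the arm_compute headers are available; the free function below is hypothetical and only illustrates the macro calls:

#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Validate.h"

using namespace arm_compute;

// Hypothetical guard block validating an input/output pair before configuration.
void validate_pair(const ITensor *input, const ITensor *output)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);                 // no null pointers
    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);      // same shape from dimension 0
    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);  // same data type
    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output); // same QS8/QS16 format, if any
}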
diff --git a/arm_compute/core/Window.h b/arm_compute/core/Window.h
index 727a489..6e7ef22 100644
--- a/arm_compute/core/Window.h
+++ b/arm_compute/core/Window.h
@@ -30,7 +30,7 @@
 
 #include "arm_compute/core/Coordinates.h"
 #include "arm_compute/core/Error.h"
-#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/ITensorInfo.h"
 #include "arm_compute/core/Utils.h"
 
 namespace arm_compute
@@ -160,7 +160,7 @@
      * @param[in] info            Tensor information to copy the dimensions from.
      * @param[in] first_dimension Only copy dimensions which are greater or equal to this value.
      */
-    void use_tensor_dimensions(const TensorInfo *info, size_t first_dimension = Window::DimX);
+    void use_tensor_dimensions(const ITensorInfo *info, size_t first_dimension = Window::DimX);
 
     /** Shift the values of a given dimension by the given shift_value
      *
@@ -169,6 +169,14 @@
      */
     void shift(size_t dimension, int shift_value);
 
+    /** Adjust the start or end of a given dimension by the given value
+     *
+     * @param[in] dimension    The dimension to adjust
+     * @param[in] adjust_value The value by which to adjust the start or the end of the dimension.
+     * @param[in] is_at_start  True to adjust the start of the dimension, false to adjust the end.
+     */
+    void adjust(size_t dimension, int adjust_value, bool is_at_start);
+
     /** Scale the values of a given dimension by the given scale_value
      *
      * @note The end of the window is rounded up to be a multiple of step after the scaling.
@@ -273,6 +281,18 @@
     {
         return slide_window_slice<3>(slice);
     }
+    /** Slide the passed 4D window slice.
+     *
+     * If slice contains the last slice then it will remain unchanged and false will be returned.
+     *
+     * @param[in,out] slice Current slice, to be updated to the next slice.
+     *
+     * @return true if slice contains a new slice, false if slice already contained the last slice
+     */
+    bool slide_window_slice_4D(Window &slice) const
+    {
+        return slide_window_slice<4>(slice);
+    }
     /** Sets the ID of the thread that the window is associated with.
      *
      * @param id ID of the thread that the window is associated with.
diff --git a/arm_compute/core/Window.inl b/arm_compute/core/Window.inl
index 71bcaa3..75428a1 100644
--- a/arm_compute/core/Window.inl
+++ b/arm_compute/core/Window.inl
@@ -50,6 +50,21 @@
     d                    = Window::Dimension(d.start() + shift_value, d.end() + shift_value, d.step());
 }
 
+inline void Window::adjust(size_t dimension, int adjust_value, bool is_at_start)
+{
+    ARM_COMPUTE_ERROR_ON(dimension >= Coordinates::num_max_dimensions);
+    Window::Dimension &d = _dims[dimension];
+
+    if(is_at_start)
+    {
+        d = Window::Dimension(d.start() + adjust_value, d.end(), d.step());
+    }
+    else
+    {
+        d = Window::Dimension(d.start(), d.end() + adjust_value, d.step());
+    }
+}
+
 inline void Window::scale(const size_t dimension, float scale_value)
 {
     ARM_COMPUTE_ERROR_ON(dimension >= Coordinates::num_max_dimensions);
@@ -157,7 +172,7 @@
     return slice;
 }
 
-inline void Window::use_tensor_dimensions(const TensorInfo *info, const size_t first_dimension)
+inline void Window::use_tensor_dimensions(const ITensorInfo *info, const size_t first_dimension)
 {
     for(unsigned int n = first_dimension; n < info->num_dimensions(); ++n)
     {
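
The new Window::adjust() above moves either the start or the end of one dimension by a signed offset, while shift() moves both. A small illustrative sketch (the window bounds below are arbitrary):

#include "arm_compute/core/Window.h"

using namespace arm_compute;

// Build a 2D window by hand and adjust its bounds.
Window make_adjusted_window()
{
    Window win;
    win.set(Window::DimX, Window::Dimension(0, 16, 1));
    win.set(Window::DimY, Window::Dimension(0, 8, 1));

    win.adjust(Window::DimX, 2, true);  // start of X: 0 -> 2, end unchanged
    win.adjust(Window::DimY, 4, false); // end of Y: 8 -> 12, start unchanged
    return win;
}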
diff --git a/arm_compute/runtime/CL/CLFunctions.h b/arm_compute/runtime/CL/CLFunctions.h
index d16354f..82929ba 100644
--- a/arm_compute/runtime/CL/CLFunctions.h
+++ b/arm_compute/runtime/CL/CLFunctions.h
@@ -30,6 +30,7 @@
 #include "arm_compute/runtime/CL/functions/CLActivationLayer.h"
 #include "arm_compute/runtime/CL/functions/CLArithmeticAddition.h"
 #include "arm_compute/runtime/CL/functions/CLArithmeticSubtraction.h"
+#include "arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h"
 #include "arm_compute/runtime/CL/functions/CLBitwiseAnd.h"
 #include "arm_compute/runtime/CL/functions/CLBitwiseNot.h"
 #include "arm_compute/runtime/CL/functions/CLBitwiseOr.h"
@@ -41,6 +42,7 @@
 #include "arm_compute/runtime/CL/functions/CLColorConvert.h"
 #include "arm_compute/runtime/CL/functions/CLConvolution.h"
 #include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"
+#include "arm_compute/runtime/CL/functions/CLDepthConcatenate.h"
 #include "arm_compute/runtime/CL/functions/CLDepthConvert.h"
 #include "arm_compute/runtime/CL/functions/CLDerivative.h"
 #include "arm_compute/runtime/CL/functions/CLDilate.h"
@@ -55,11 +57,16 @@
 #include "arm_compute/runtime/CL/functions/CLGaussian3x3.h"
 #include "arm_compute/runtime/CL/functions/CLGaussian5x5.h"
 #include "arm_compute/runtime/CL/functions/CLGaussianPyramid.h"
+#include "arm_compute/runtime/CL/functions/CLHOGDescriptor.h"
+#include "arm_compute/runtime/CL/functions/CLHOGDetector.h"
+#include "arm_compute/runtime/CL/functions/CLHOGGradient.h"
+#include "arm_compute/runtime/CL/functions/CLHOGMultiDetection.h"
 #include "arm_compute/runtime/CL/functions/CLHarrisCorners.h"
 #include "arm_compute/runtime/CL/functions/CLHistogram.h"
 #include "arm_compute/runtime/CL/functions/CLIntegralImage.h"
 #include "arm_compute/runtime/CL/functions/CLLaplacianPyramid.h"
 #include "arm_compute/runtime/CL/functions/CLLaplacianReconstruct.h"
+#include "arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h"
 #include "arm_compute/runtime/CL/functions/CLMagnitude.h"
 #include "arm_compute/runtime/CL/functions/CLMeanStdDev.h"
 #include "arm_compute/runtime/CL/functions/CLMedian3x3.h"
diff --git a/arm_compute/runtime/CL/CLHOG.h b/arm_compute/runtime/CL/CLHOG.h
new file mode 100644
index 0000000..9b4a303
--- /dev/null
+++ b/arm_compute/runtime/CL/CLHOG.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLHOG_H__
+#define __ARM_COMPUTE_CLHOG_H__
+
+#include "arm_compute/core/CL/ICLHOG.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/HOGInfo.h"
+#include "arm_compute/core/Types.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+/** OpenCL implementation of HOG data-object */
+class CLHOG : public ICLHOG
+{
+public:
+    /** Default constructor */
+    CLHOG();
+    /** Allocate the HOG descriptor using the given HOG's metadata
+     *
+     * @param[in] input HOG's metadata used to allocate the HOG descriptor
+     */
+    void init(const HOGInfo &input);
+
+    /** Enqueue a map operation of the allocated buffer.
+     *
+     * @param[in] blocking If true, then the mapping will be ready to use by the time
+     *                     this method returns, else it is the caller's responsibility
+     *                     to flush the queue and wait for the mapping operation to have completed.
+     */
+    void map(bool blocking = true);
+    using ICLHOG::map;
+
+    /** Enqueue an unmap operation of the allocated and mapped buffer.
+     *
+     * @note This method simply enqueues the unmap operation, it is the caller's responsibility to flush the queue and make sure the unmap is finished before
+     *       the memory is accessed by the device.
+     */
+    void unmap();
+    using ICLHOG::unmap;
+
+    // Inherited method overridden:
+    void              free() override;
+    const HOGInfo    *info() const override;
+    const cl::Buffer &cl_buffer() const override;
+
+protected:
+    // Inherited methods overridden:
+    uint8_t *do_map(cl::CommandQueue &q, bool blocking) override;
+    void do_unmap(cl::CommandQueue &q) override;
+
+private:
+    HOGInfo    _info;
+    cl::Buffer _buffer;
+};
+}
+#endif /* __ARM_COMPUTE_CLHOG_H__ */
diff --git a/arm_compute/runtime/CL/CLMultiHOG.h b/arm_compute/runtime/CL/CLMultiHOG.h
new file mode 100644
index 0000000..17bb4e0
--- /dev/null
+++ b/arm_compute/runtime/CL/CLMultiHOG.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLMULTIHOG_H__
+#define __ARM_COMPUTE_CLMULTIHOG_H__
+
+#include "arm_compute/core/CL/ICLMultiHOG.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLHOG.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+/** Basic implementation of the CL multi HOG data-objects */
+class CLMultiHOG : public ICLMultiHOG
+{
+public:
+    /** Constructor
+     *
+     * @param[in] num_models Number of HOG data objects to contain
+     *
+     */
+    CLMultiHOG(size_t num_models);
+
+    // Inherited methods overridden:
+    size_t  num_models() const override;
+    ICLHOG *cl_model(size_t index) override;
+    const ICLHOG *cl_model(size_t index) const override;
+
+private:
+    size_t                   _num_models;
+    std::unique_ptr<CLHOG[]> _model;
+};
+}
+#endif /*__ARM_COMPUTE_CLMULTIHOG_H__ */
diff --git a/arm_compute/runtime/CL/CLScheduler.h b/arm_compute/runtime/CL/CLScheduler.h
index 71baa55..8e80259 100644
--- a/arm_compute/runtime/CL/CLScheduler.h
+++ b/arm_compute/runtime/CL/CLScheduler.h
@@ -24,8 +24,12 @@
 #ifndef __ARM_COMPUTE_CLSCHEDULER_H__
 #define __ARM_COMPUTE_CLSCHEDULER_H__
 
+#include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLTypes.h"
 #include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Types.h"
 
 namespace arm_compute
 {
@@ -50,7 +54,7 @@
     void default_init()
     {
         CLKernelLibrary::get().init("./cl_kernels/", cl::Context::getDefault(), cl::Device::getDefault());
-        init(cl::Context::getDefault(), cl::CommandQueue::getDefault());
+        init(cl::Context::getDefault(), cl::CommandQueue::getDefault(), cl::Device::getDefault());
     }
     /** Schedule the execution of the passed kernel if possible.
      *
@@ -63,11 +67,14 @@
      *
      * @param[in] context A CL context.
      * @param[in] queue   A CL command queue.
+     * @param[in] device  A CL device.
      */
-    void init(cl::Context context = cl::Context::getDefault(), cl::CommandQueue queue = cl::CommandQueue::getDefault())
+    void init(cl::Context context = cl::Context::getDefault(), cl::CommandQueue queue = cl::CommandQueue::getDefault(),
+              cl::Device device = cl::Device::getDefault())
     {
         _context = std::move(context);
         _queue   = std::move(queue);
+        _target  = get_target_from_device(device);
     }
 
     /** Accessor for the associated CL context.
@@ -97,6 +104,15 @@
         return _queue;
     }
 
+    /** Get the target GPU.
+     *
+     * @return The target GPU.
+     */
+    GPUTarget target() const
+    {
+        return _target;
+    }
+
     /** Accessor to set the CL command queue to be used by the scheduler.
      *
      * @param[in] queue A CL command queue.
@@ -106,6 +122,15 @@
         _queue = std::move(queue);
     }
 
+    /** Accessor to set target GPU to be used by the scheduler.
+     *
+     * @param[in] target The target GPU.
+     */
+    void set_target(GPUTarget target)
+    {
+        _target = target;
+    }
+
     /** Blocks until all commands in the associated command queue have finished. */
     void sync()
     {
@@ -127,6 +152,7 @@
 private:
     cl::Context      _context;
     cl::CommandQueue _queue;
+    GPUTarget        _target;
 };
 }
 #endif /* __ARM_COMPUTE_CLSCHEDULER_H__ */
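
With the scheduler now tracking the GPU target detected from the device, callers can query it after initialisation. A hedged sketch, assuming the usual CLScheduler::get() singleton accessor and the GPUTarget::BIFROST enumerator from CLTypes.h:

#include "arm_compute/runtime/CL/CLScheduler.h"

using namespace arm_compute;

// Initialise the scheduler from the OpenCL defaults and branch on the detected GPU target.
void init_and_query_target()
{
    CLScheduler::get().default_init(); // context, queue and device from cl::*::getDefault()
    const GPUTarget target = CLScheduler::get().target();

    if(target == GPUTarget::BIFROST) // assumed enumerator, e.g. Mali-G71
    {
        // e.g. select kernels or tunings specific to Bifrost GPUs
    }
}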
diff --git a/arm_compute/runtime/CL/CLSubTensor.h b/arm_compute/runtime/CL/CLSubTensor.h
new file mode 100644
index 0000000..4bab164
--- /dev/null
+++ b/arm_compute/runtime/CL/CLSubTensor.h
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLSUBTENSOR_H__
+#define __ARM_COMPUTE_CLSUBTENSOR_H__
+
+#include "arm_compute/core/SubTensorInfo.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensorInfo;
+
+/** Basic implementation of the OpenCL sub-tensor interface */
+class CLSubTensor : public ICLTensor
+{
+public:
+    /** Constructor
+     *
+     * @param[in] parent       Parent tensor
+     * @param[in] tensor_shape Shape of the subtensor
+     * @param[in] coords       Coordinates of the first subtensor element inside the parent tensor.
+     */
+    CLSubTensor(ICLTensor *parent, const TensorShape &tensor_shape, const Coordinates &coords);
+    /** Destructor: free the tensor's memory */
+    ~CLSubTensor() = default;
+    /** Restrict instances of this class to be copy constructed */
+    CLSubTensor(const CLSubTensor &) = delete;
+    /** Restrict instances of this class to be copied */
+    CLSubTensor &operator=(const CLSubTensor &) = delete;
+    /** Allow instances of this class to be move constructed */
+    CLSubTensor(CLSubTensor &&) = default;
+    /** Allow instances of this class to be moved */
+    CLSubTensor &operator=(CLSubTensor &&) = default;
+
+    /** Enqueue a map operation of the allocated buffer.
+     *
+     * @note Mapping a subtensor will lead to the mapping of the whole parent tensor for now.
+     *
+     * @param[in] blocking If true, then the mapping will be ready to use by the time
+     *                     this method returns, else it is the caller's responsibility
+     *                     to flush the queue and wait for the mapping operation to have completed.
+     */
+    void map(bool blocking = true);
+    using ICLTensor::map;
+    /** Enqueue an unmap operation of the allocated and mapped buffer.
+     *
+     * @note Unmapping a subtensor will lead to the unmapping of the whole parent tensor for now.
+     *
+     * @note This method simply enqueues the unmap operation, it is the caller's responsibility to flush the queue and make sure the unmap is finished before
+     *       the memory is accessed by the device.
+     */
+    void unmap();
+    using ICLTensor::unmap;
+
+    /** Return the parent tensor of the subtensor
+     *
+     * @return Parent tensor
+     */
+    ICLTensor *parent();
+
+    // Inherited methods overridden:
+    ITensorInfo      *info() const override;
+    ITensorInfo      *info() override;
+    const cl::Buffer &cl_buffer() const override;
+
+protected:
+    // Inherited methods overridden:
+    uint8_t *do_map(cl::CommandQueue &q, bool blocking) override;
+    void do_unmap(cl::CommandQueue &q) override;
+
+private:
+    ICLTensor            *_parent;
+    mutable SubTensorInfo _info;
+};
+}
+#endif /*__ARM_COMPUTE_CLSUBTENSOR_H__ */
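
A subtensor is a view into its parent's buffer, so no new device memory is allocated. A hedged sketch that creates a view of the upper half of the parent's Z dimension; the shapes and coordinates are illustrative and the parent is assumed to be initialised elsewhere:

#include "arm_compute/core/Coordinates.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/runtime/CL/CLSubTensor.h"
#include "arm_compute/runtime/CL/CLTensor.h"

using namespace arm_compute;

// View the second half of the parent's Z dimension without copying data.
CLSubTensor make_upper_half(CLTensor &parent)
{
    const size_t      half_z = parent.info()->dimension(2) / 2;
    const TensorShape sub_shape(parent.info()->dimension(0), parent.info()->dimension(1), half_z);
    const Coordinates coords(0, 0, half_z);
    return CLSubTensor(&parent, sub_shape, coords);
}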
diff --git a/arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h b/arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h
new file mode 100644
index 0000000..d766d1c
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLBATCHNORMALIZATIONLAYER_H__
+#define __ARM_COMPUTE_CLBATCHNORMALIZATIONLAYER_H__
+
+#include "arm_compute/runtime/IFunction.h"
+
+#include "arm_compute/core/CL/kernels/CLBatchNormalizationLayerKernel.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to run @ref CLBatchNormalizationLayerKernel and simulate a batch normalization layer.
+ *
+ * Batch normalization is calculated by:
+ * @f[ out_i = \gamma * (\frac{in_i - \mu_{B}}{\sqrt{\sigma^2_{B} + \epsilon}}) + \beta \equiv BN_{\gamma,\beta}(in_i) @f]
+ *
+ */
+class CLBatchNormalizationLayer : public IFunction
+{
+public:
+    /** Default constructor */
+    CLBatchNormalizationLayer();
+    /** Set the input and output tensors.
+     *
+     * @param[in]  input   Source tensor. 3 lower dimensions represent a single input with dimensions [width, height, FM].
+     *                     The rest are optional and used for representing batches. Data types supported: F32.
+     * @param[in]  mean    Mean values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+     * @param[in]  var     Variance values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+     * @param[in]  gamma   Gamma values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+     * @param[in]  beta    Beta values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+     * @param[in]  epsilon Small value to avoid division with zero.
+     * @param[out] output  Destination tensor. Output will have the same number of dimensions as input. Data type supported: same as @p input
+     */
+    void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, const ICLTensor *gamma, float epsilon);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    CLBatchNormalizationLayerKernel _norm_kernel; /**< BatchNormalization layer kernel to run */
+};
+}
+#endif /* __ARM_COMPUTE_CLBATCHNORMALIZATIONLAYER_H__ */
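
Given the formula above, the layer only needs the per-feature-map statistics and scale/shift tensors at configure time. A minimal usage sketch, assuming all tensors are F32 and already initialised and allocated elsewhere with the shapes described in the configure() documentation:

#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h"

using namespace arm_compute;

// Note the argument order of configure(): input, output, mean, var, beta, gamma, epsilon.
void run_batch_norm(CLTensor &src, CLTensor &dst, CLTensor &mean, CLTensor &var,
                    CLTensor &beta, CLTensor &gamma)
{
    CLBatchNormalizationLayer bn;
    bn.configure(&src, &dst, &mean, &var, &beta, &gamma, 0.001f);
    bn.run();
    CLScheduler::get().sync(); // wait for the enqueued work to finish
}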
diff --git a/arm_compute/runtime/CL/functions/CLConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLConvolutionLayer.h
index 2a9b487..6a40396 100644
--- a/arm_compute/runtime/CL/functions/CLConvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLConvolutionLayer.h
@@ -27,12 +27,12 @@
 #include "arm_compute/runtime/IFunction.h"
 
 #include "arm_compute/core/CL/kernels/CLCol2ImKernel.h"
-#include "arm_compute/core/CL/kernels/CLConvolutionLayerWeightsReshapeKernel.h"
 #include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
 #include "arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h"
 #include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h"
 #include "arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h"
 #include "arm_compute/core/CL/kernels/CLIm2ColKernel.h"
+#include "arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
@@ -42,6 +42,34 @@
 {
 class ICLTensor;
 
+/** Function to reshape and transpose the weights. This function calls the following kernels:
+ * -# @ref CLWeightsReshapeKernel
+ * -# @ref CLGEMMTranspose1xWKernel
+ */
+class CLConvolutionLayerReshapeWeights : public IFunction
+{
+public:
+    /** Constructor */
+    CLConvolutionLayerReshapeWeights();
+    /** Set the input and output tensors.
+     *
+     * @param[in]  weights      Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: F32.
+     * @param[in]  biases       Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p weights.
+     * @param[out] output       Destination tensor. Data types supported: Same as @p weights.
+     * @param[in]  transpose1xW True if the weights are to undergo a 1xW transposition after reshaping (in case of GEMM operation), false otherwise.
+     */
+    void configure(const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, bool transpose1xW);
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    CLConvolutionLayerWeightsReshapeKernel _weights_reshape_kernel;
+    CLGEMMTranspose1xWKernel               _weights_transposed_kernel;
+    CLTensor                               _weights_reshaped;
+    bool                                   _transpose1xW;
+};
+
 /** Basic function to compute the convolution layer. This function calls the following OpenCL kernels:
  *
  * -# @ref CLConvolutionLayerWeightsReshapeKernel (executed only once for each configuration)
@@ -58,35 +86,36 @@
     CLConvolutionLayer();
     /** Set the input and output tensors.
      *
-     * @param[in]  input     Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
-     *                       while every optional dimension from 4 and above represent a batch of inputs.
-     *                       Data types supported: F16, F32.
-     * @param[in]  weights   Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p input.
-     * @param[in]  biases    Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported:Same as @p input.
-     * @param[out] output    Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
-     *                       Data types supported: Same as @p input.
-     * @param[in]  conv_info Contains padding and stride information described in @ref PadStrideInfo.
+     * @param[in]  input        Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
+     *                          while every optional dimension from 4 and above represent a batch of inputs.
+     *                          Data types supported: F16, F32.
+     * @param[in]  weights      Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p input.
+     * @param[in]  biases       Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported:Same as @p input.
+     * @param[out] output       Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
+     *                          Data types supported: Same as @p input.
+     * @param[in]  conv_info    Contains padding and stride information described in @ref PadStrideInfo.
+     * @param[in]  weights_info Specifies if the weights tensor has been reshaped with CLWeightsReshapeKernel. If this is not part of the fully connected layer the weights
+     *                          tensor has also been transposed with CLGEMMTranspose1xWKernel. Data type supported: Same as @p input.
      */
-    void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info);
+    void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info = WeightsInfo());
 
     // Inherited methods overridden:
     void run() override;
 
 private:
-    CLIm2ColKernel                         _input_im2col_kernel;
-    CLConvolutionLayerWeightsReshapeKernel _weights_reshape_kernel;
-    CLGEMMInterleave4x4Kernel              _input_interleave_kernel;
-    CLGEMMTranspose1xWKernel               _weights_transposed_kernel;
-    CLGEMMMatrixMultiplyKernel             _mm_kernel;
-    CLCol2ImKernel                         _output_col2im_kernel;
-    CLTensor                               _input_im2col_reshaped;
-    CLTensor                               _input_interleaved_reshaped;
-    CLTensor                               _weights_reshaped;
-    CLTensor                               _weights_transposed;
-    CLTensor                               _gemm_output;
-    bool                                   _is_first_run;
-    bool                                   _has_bias;
-    bool                                   _is_fc;
+    CLConvolutionLayerReshapeWeights _reshape_weights;
+    CLIm2ColKernel                   _input_im2col_kernel;
+    CLGEMMInterleave4x4Kernel        _input_interleave_kernel;
+    CLGEMMMatrixMultiplyKernel       _mm_kernel;
+    CLCol2ImKernel                   _output_col2im_kernel;
+    CLTensor                         _input_im2col_reshaped;
+    CLTensor                         _input_interleaved_reshaped;
+    CLTensor                         _weights_reshaped;
+    CLTensor                         _weights_transposed;
+    CLTensor                         _gemm_output;
+    bool                             _has_bias;
+    bool                             _is_fully_connected_convolution;
+    bool                             _are_weights_reshaped;
 };
 }
 #endif /* __ARM_COMPUTE_CLCONVOLUTIONLAYER_H__ */
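
With the weights reshape split out, a basic convolution configuration still only needs the tensors and a PadStrideInfo; the optional WeightsInfo is only required when passing pre-reshaped weights. A hedged sketch for a stride-1, pad-1 convolution with tensors initialised and allocated elsewhere:

#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"

using namespace arm_compute;

// Configure and run a convolution with stride 1 in both directions and 1 pixel of padding.
void run_convolution(CLTensor &src, CLTensor &weights, CLTensor &biases, CLTensor &dst)
{
    CLConvolutionLayer conv;
    conv.configure(&src, &weights, &biases, &dst, PadStrideInfo(1, 1, 1, 1));
    conv.run();
}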
diff --git a/arm_compute/runtime/CL/functions/CLDepthConcatenate.h b/arm_compute/runtime/CL/functions/CLDepthConcatenate.h
new file mode 100644
index 0000000..3199936
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLDepthConcatenate.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLDEPTHCONCATENATE_H__
+#define __ARM_COMPUTE_CLDEPTHCONCATENATE_H__
+
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <memory>
+#include <vector>
+
+namespace arm_compute
+{
+class ICLTensor;
+class CLDepthConcatenateKernel;
+class CLFillBorderKernel;
+
+/** Basic function to execute concatenate tensors along z axis. This function calls the following kernels:
+ *
+ * -# @ref CLFillBorderKernel (executed if input's lowest two dimensions are smaller than respective output's dimensions)
+ * -# @ref CLDepthConcatenateKernel
+ *
+ */
+class CLDepthConcatenate : public IFunction
+{
+public:
+    /** Default constructor */
+    CLDepthConcatenate();
+    /** Initialise the kernel's inputs vector and output.
+     *
+     * @param[in,out] inputs_vector The vector containing all the tensors to concatenate. Data types supported: F32.
+     * @param[out]    output        Output tensor. Data types supported: F32.
+     */
+    void configure(std::vector<ICLTensor *> inputs_vector, ICLTensor *output);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    std::vector<ICLTensor *>                    _inputs_vector;
+    std::unique_ptr<CLDepthConcatenateKernel[]> _concat_kernels_vector;
+    std::unique_ptr<CLFillBorderKernel[]>       _border_handlers_vector;
+    unsigned int                                _num_inputs;
+};
+}
+#endif /* __ARM_COMPUTE_CLDEPTHCONCATENATE_H__ */
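
A hedged usage sketch of the new depth concatenation function, assuming two F32 feature-map tensors and an output tensor that has already been sized to hold both along Z:

#include <vector>

#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLDepthConcatenate.h"

using namespace arm_compute;

// Concatenate two tensors along the Z (depth) axis.
void concat_depth(CLTensor &a, CLTensor &b, CLTensor &out)
{
    CLDepthConcatenate concat;
    concat.configure(std::vector<ICLTensor *>{ &a, &b }, &out);
    concat.run();
}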
diff --git a/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h b/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h
index 09e4fc9..826f445 100644
--- a/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h
+++ b/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h
@@ -36,13 +36,44 @@
 
 namespace arm_compute
 {
+/** Basic function to reshape the weights of Fully Connected layer with OpenCL. This function calls the following kernels:
+ *
+ *  -# @ref CLTransposeKernel        (if @p transpose_weights is set to true)
+ *  -# @ref CLGEMMTranspose1xWKernel (if @p is_batched_fc_layer is set to true)
+ *
+ * @note  The fully connected layer accepts "weights" tensors only with 2 dimensions.
+ */
+class CLFullyConnectedLayerReshapeWeights : public IFunction
+{
+public:
+    /** Constructor */
+    CLFullyConnectedLayerReshapeWeights();
+    /** Set the input and output tensors.
+     *
+     * @param[in]  input               Weights tensor. The weights must be 2 dimensional. Data types supported: QS8/F32.
+     * @param[out] output              Destination tensor. Data type supported: Same as @p input.
+     * @param[in]  transpose_weights   True if the weights must be transposed.
+     * @param[in]  is_batched_fc_layer True if it is a batched fully connected layer.
+     */
+    void configure(const ICLTensor *input, ICLTensor *output, bool transpose_weights, bool is_batched_fc_layer);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    CLTransposeKernel        _transpose_kernel;
+    CLGEMMTranspose1xWKernel _transpose1xW_kernel;
+    CLTensor                 _transpose_output;
+    bool                     _transpose_weights;
+    bool                     _is_batched_fc_layer;
+};
+
 /** Basic function to compute a Fully Connected layer on OpenCL. This function calls the following OpenCL kernels:
  *
  *  -# @ref CLIm2ColKernel (called when the input comes from a convolutional layer)
- *  -# @ref CLTransposeKernel (if @p transpose_weights is set to true) (called once)
- *  -# @ref NEGEMMTranspose1xWKernel (called once if we have a multi-batch input)
- *  -# @ref NEGEMMInterleave4x4Kernel (called if we have a multi-batch input)
- *  -# @ref NEGEMMMatrixMultiplyKernel
+ *  -# @ref CLFullyConnectedLayerReshapeWeights (if @p are_weights_reshaped is set to false) (called once)
+ *  -# @ref CLGEMMInterleave4x4Kernel (called if we have a multi-batch input)
+ *  -# @ref CLGEMMMatrixMultiplyKernel
  *  -# @ref CLGEMMMatrixAccumulateBiasesKernel (if @p biases is not equal to nullptr)
  *
  * @note  The fully connected layer accepts "weights" tensors only with 2 dimensions.
@@ -54,13 +85,14 @@
     CLFullyConnectedLayer();
     /** Set the input and output tensors.
      *
-     * @param[in]  input             Source tensor. Data type supported: F16, F32.
-     * @param[in]  weights           Weights tensor. The weights must be 2 dimensional. Data type supported: Same as @p input
-     * @param[in]  biases            Bias tensor. It can be nullptr. Data type supported:Same as @p input.
-     * @param[out] output            Destination tensor. Data type supported: Same as @p input.
-     * @param[in]  transpose_weights (Optional) Transpose weights if true. Defaults to true.
+     * @param[in]  input                Source tensor. Data type supported: F16/F32.
+     * @param[in]  weights              Weights tensor. The weights must be 2 dimensional. Data type supported: Same as @p input
+     * @param[in]  biases               Bias tensor. It can be nullptr. Data type supported:Same as @p input.
+     * @param[out] output               Destination tensor. Data type supported: Same as @p input.
+     * @param[in]  transpose_weights    (Optional) Transpose weights if true. Defaults to true.
+     * @param[in]  are_weights_reshaped (Optional) Reshape the weights tensor if false. Defaults to false.
      */
-    void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, bool transpose_weights = true);
+    void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, bool transpose_weights = true, bool are_weights_reshaped = false);
 
     //Inherited methods override
     void run() override;
@@ -71,21 +103,18 @@
     void configure_conv_fc_wb(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output);
     void configure_conv_fc_nb(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output);
 
-    CLIm2ColKernel                     _im2col_kernel;
-    CLTransposeKernel                  _transpose_kernel;
-    CLGEMMTranspose1xWKernel           _transpose1xW_kernel;
-    CLGEMMInterleave4x4Kernel          _interleave4x4_kernel;
-    CLGEMMMatrixMultiplyKernel         _mm_kernel;
-    CLGEMMMatrixAccumulateBiasesKernel _accumulate_biases_kernel;
-    CLTensor                           _im2col_output;
-    CLTensor                           _interleave4x4_output;
-    CLTensor                           _transpose_output;
-    CLTensor                           _transpose1xW_output;
-    bool                               _is_first_run;
-    bool                               _transpose_weights;
-    bool                               _fc_after_conv;
-    bool                               _batched_fc_layer;
-    bool                               _accumulate_biases;
+    CLIm2ColKernel                      _im2col_kernel;
+    CLFullyConnectedLayerReshapeWeights _reshape_weights_kernel;
+    CLGEMMInterleave4x4Kernel           _interleave4x4_kernel;
+    CLGEMMMatrixMultiplyKernel          _mm_kernel;
+    CLGEMMMatrixAccumulateBiasesKernel  _accumulate_biases_kernel;
+    CLTensor                            _im2col_output;
+    CLTensor                            _interleave4x4_output;
+    CLTensor                            _reshape_weights_output;
+    bool                                _are_weights_reshaped;
+    bool                                _is_fc_after_conv;
+    bool                                _is_batched_fc_layer;
+    bool                                _accumulate_biases;
 };
 }
 #endif /* __ARM_COMPUTE_CLFULLYCONNECTEDLAYER_H__ */
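
A hedged sketch of the updated fully connected layer interface: here the 2D weights still need transposing and have not been reshaped beforehand, so the defaults are spelled out explicitly. Tensors are assumed to be initialised and allocated elsewhere:

#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h"

using namespace arm_compute;

// Fully connected layer with on-the-fly weights reshaping.
void run_fully_connected(CLTensor &src, CLTensor &weights, CLTensor &biases, CLTensor &dst)
{
    CLFullyConnectedLayer fc;
    fc.configure(&src, &weights, &biases, &dst, true /* transpose_weights */, false /* are_weights_reshaped */);
    fc.run();
}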
diff --git a/arm_compute/runtime/CL/functions/CLHOGDescriptor.h b/arm_compute/runtime/CL/functions/CLHOGDescriptor.h
new file mode 100644
index 0000000..cdb23bf
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLHOGDescriptor.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLHOGDESCRIPTOR_H__
+#define __ARM_COMPUTE_CLHOGDESCRIPTOR_H__
+
+#include "arm_compute/core/CL/kernels/CLHOGDescriptorKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/functions/CLHOGGradient.h"
+#include "arm_compute/runtime/IFunction.h"
+
+namespace arm_compute
+{
+class IHOG;
+/** Basic function to calculate HOG descriptor. This function calls the following OpenCL kernels:
+ *
+ * -# @ref CLHOGGradient
+ * -# @ref CLHOGOrientationBinningKernel
+ * -# @ref CLHOGBlockNormalizationKernel
+ *
+ */
+class CLHOGDescriptor : public IFunction
+{
+public:
+    /** Default constructor */
+    CLHOGDescriptor();
+    /** Initialise the function's source, destination, HOG data-object and border mode
+     *
+     * @param[in, out] input                 Input tensor. Data type supported: U8
+     *                                       (Written to only for @p border_mode != UNDEFINED)
+     * @param[out]     output                Output tensor which stores the HOG descriptor. DataType supported: F32. The number of channels is equal to the number of histogram bins per block
+     * @param[in]      hog                   HOG data object which describes the HOG descriptor
+     * @param[in]      border_mode           Border mode to use.
+     * @param[in]      constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+     */
+    void configure(ICLTensor *input, ICLTensor *output, const IHOG *hog, BorderMode border_mode, uint8_t constant_border_value = 0);
+
+    // Inherited method overridden:
+    void run() override;
+
+private:
+    CLHOGGradient                 _gradient;
+    CLHOGOrientationBinningKernel _orient_bin;
+    CLHOGBlockNormalizationKernel _block_norm;
+    CLTensor                      _mag;
+    CLTensor                      _phase;
+    CLTensor                      _hog_space;
+};
+}
+
+#endif /* __ARM_COMPUTE_CLHOGDESCRIPTOR_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLHOGDetector.h b/arm_compute/runtime/CL/functions/CLHOGDetector.h
new file mode 100644
index 0000000..0b4fad7
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLHOGDetector.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLHOGDETECTOR_H__
+#define __ARM_COMPUTE_CLHOGDETECTOR_H__
+
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/CL/kernels/CLHOGDetectorKernel.h"
+#include "arm_compute/core/IHOG.h"
+#include "arm_compute/runtime/IFunction.h"
+
+namespace arm_compute
+{
+/** Basic function to execute HOG detector based on linear SVM. This function calls the following OpenCL kernel:
+ *
+ * -# @ref CLHOGDetectorKernel
+ *
+ */
+class CLHOGDetector : public IFunction
+{
+public:
+    /** Default constructor */
+    CLHOGDetector();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLHOGDetector(const CLHOGDetector &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLHOGDetector &operator=(const CLHOGDetector &) = delete;
+    /** Allow instances of this class to be moved */
+    CLHOGDetector(CLHOGDetector &&) = default;
+    /** Allow instances of this class to be moved */
+    CLHOGDetector &operator=(CLHOGDetector &&) = default;
+    /** Default destructor */
+    ~CLHOGDetector() = default;
+    /** Initialise the kernel's input, output, HOG data object, detection window stride, threshold and index class
+     *
+     * @attention The function does not reset the number of values in @ref IDetectionWindowArray so it is the caller's responsibility to clear it.
+     *
+     * @param[in]  input                   Input tensor. It is the output of @ref CLHOGDescriptor. Data type supported: F32
+     * @param[in]  hog                     HOG data-object that describes the HOG descriptor
+     * @param[out] detection_windows       Array of @ref DetectionWindow used to store the detected objects
+     * @param[in]  detection_window_stride Distance in pixels between 2 consecutive detection windows in x and y directions.
+     *                                     It must be a multiple of the block stride stored in @p hog
+     * @param[in]  threshold               (Optional) Threshold for the distance between features and SVM classifying plane
+     * @param[in]  idx_class               (Optional) Index of the class used for evaluating which class the detection window belongs to
+     */
+    void configure(const ICLTensor *input, const ICLHOG *hog, ICLDetectionWindowArray *detection_windows, const Size2D &detection_window_stride, float threshold = 0.0f, size_t idx_class = 0);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    CLHOGDetectorKernel      _hog_detector_kernel;
+    ICLDetectionWindowArray *_detection_windows;
+    cl::Buffer               _num_detection_windows;
+};
+}
+
+#endif /* __ARM_COMPUTE_CLHOGDETECTOR_H__ */
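A usage sketch for CLHOGDetector (illustrative only; the detection window stride, array capacity and the use of IArray::clear() are assumptions):

    #include "arm_compute/runtime/CL/CLArray.h"
    #include "arm_compute/runtime/CL/functions/CLHOGDetector.h"

    using namespace arm_compute;

    void run_hog_detector(const ICLTensor &descriptor, const ICLHOG &hog, ICLDetectionWindowArray &windows)
    {
        // Assumption: descriptor is the F32 output of CLHOGDescriptor and windows has enough capacity.
        windows.clear(); // the function does not reset the array, so clear it first (see @attention above)
        CLHOGDetector detector;
        detector.configure(&descriptor, &hog, &windows, Size2D(8, 8) /* assumed stride */, 0.0f, 0);
        detector.run();
    }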
diff --git a/arm_compute/runtime/CL/functions/CLHOGGradient.h b/arm_compute/runtime/CL/functions/CLHOGGradient.h
new file mode 100644
index 0000000..e74a684
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLHOGGradient.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLHOGGRADIENT_H__
+#define __ARM_COMPUTE_CLHOGGRADIENT_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+#include "arm_compute/core/CL/kernels/CLMagnitudePhaseKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/functions/CLDerivative.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+/** Basic function to calculate the gradient for HOG. This function calls the following OpenCL kernels:
+ *
+ * -# @ref CLDerivative
+ * -# @ref CLMagnitudePhaseKernel
+ *
+ */
+class CLHOGGradient : public IFunction
+{
+public:
+    /** Default constructor */
+    CLHOGGradient();
+    /** Initialise the function's source, destinations, phase type and border mode
+     *
+     * @param[in, out] input                 Input tensor. Data type supported: U8.
+     *                                       (Written to only for @p border_mode != UNDEFINED)
+     * @param[out]     output_magnitude      Output tensor (magnitude). Data type supported: U16.
+     * @param[out]     output_phase          Output tensor (phase). Data type supported: U8
+     * @param[in]      phase_type            Type of @ref PhaseType
+     * @param[in]      border_mode           Border mode to use
+     * @param[in]      constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+     */
+    void configure(ICLTensor *input, ICLTensor *output_magnitude, ICLTensor *output_phase, PhaseType phase_type, BorderMode border_mode, uint8_t constant_border_value = 0);
+
+    // Inherited method overridden:
+    void run() override;
+
+private:
+    CLDerivative           _derivative;
+    CLMagnitudePhaseKernel _mag_phase;
+    CLTensor               _gx;
+    CLTensor               _gy;
+};
+}
+#endif /* __ARM_COMPUTE_CLHOGGRADIENT_H__ */
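A usage sketch for CLHOGGradient (illustrative only; the phase type, border mode and tensor setup are assumptions):

    #include "arm_compute/runtime/CL/functions/CLHOGGradient.h"

    using namespace arm_compute;

    void run_hog_gradient(ICLTensor &src, ICLTensor &magnitude, ICLTensor &phase)
    {
        // Assumption: src, magnitude and phase have been initialised and allocated as documented above.
        CLHOGGradient gradient;
        gradient.configure(&src, &magnitude, &phase, PhaseType::UNSIGNED, BorderMode::REPLICATE);
        gradient.run(); // runs CLDerivative followed by CLMagnitudePhaseKernel
    }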
diff --git a/arm_compute/runtime/CL/functions/CLHOGMultiDetection.h b/arm_compute/runtime/CL/functions/CLHOGMultiDetection.h
new file mode 100644
index 0000000..3fe0fa9
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLHOGMultiDetection.h
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLHOGMULTIDETECTION_H__
+#define __ARM_COMPUTE_CLHOGMULTIDETECTION_H__
+
+#include "arm_compute/core/CL/ICLArray.h"
+#include "arm_compute/core/CL/ICLMultiHOG.h"
+#include "arm_compute/core/CL/kernels/CLHOGDescriptorKernel.h"
+#include "arm_compute/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/functions/CLHOGDetector.h"
+#include "arm_compute/runtime/CL/functions/CLHOGGradient.h"
+#include "arm_compute/runtime/IFunction.h"
+
+namespace arm_compute
+{
+/** Basic function to detect multiple objects (or the same object at different scales) on the same input image using HOG. This function calls the following kernels:
+ *
+ * -# @ref CLHOGGradient
+ * -# @ref CLHOGOrientationBinningKernel
+ * -# @ref CLHOGBlockNormalizationKernel
+ * -# @ref CLHOGDetector
+ * -# @ref CPPDetectionWindowNonMaximaSuppressionKernel (executed if non_maxima_suppression == true)
+ *
+ * @note This implementation works if all the HOG data-objects within the IMultiHOG container have the same:
+ *       -# Phase type
+ *       -# Normalization type
+ *       -# L2 hysteresis threshold if the normalization type is L2HYS_NORM
+ *
+ */
+class CLHOGMultiDetection : public IFunction
+{
+public:
+    /** Default constructor */
+    CLHOGMultiDetection();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLHOGMultiDetection(const CLHOGMultiDetection &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLHOGMultiDetection &operator=(const CLHOGMultiDetection &) = delete;
+    /** Initialise the function's source, destination, detection window strides, border mode, threshold and non-maxima suppression
+     *
+     * @param[in, out] input                    Input tensor. Data type supported: U8
+     *                                          (Written to only for @p border_mode != UNDEFINED)
+     * @param[in]      multi_hog                Container of multiple HOG data-objects. Each HOG data-object describes one HOG model to detect.
+     *                                          This container should store the HOG data-objects in descending or ascending cell_size width order,
+     *                                          so that it is possible to tell whether the HOG descriptor computation can be skipped for some of the HOG data-objects
+     * @param[out]     detection_windows        Array of @ref DetectionWindow used for locating the detected objects
+     * @param[in]      detection_window_strides Array of @ref Size2D used to specify the distance in pixels between 2 consecutive detection windows in x and y directions for each HOG data-object
+     *                                          The dimension of this array must be the same as multi_hog->num_models()
+     *                                          The i-th detection_window_stride of this array must be a multiple of the block_stride stored in the i-th HOG data-object of @p multi_hog
+     * @param[in]      border_mode              Border mode to use.
+     * @param[in]      constant_border_value    (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+     * @param[in]      threshold                (Optional) Threshold for the distance between features and SVM classifying plane
+     * @param[in]      non_maxima_suppression   (Optional) Flag to specify whether the non-maxima suppression is required or not.
+     *                                          True if the non-maxima suppression stage has to be computed
+     * @param[in]      min_distance             (Optional) Radial Euclidean distance to use for the non-maxima suppression stage
+     *
+     */
+    void configure(ICLTensor *input, const ICLMultiHOG *multi_hog, ICLDetectionWindowArray *detection_windows, ICLSize2DArray *detection_window_strides, BorderMode border_mode,
+                   uint8_t constant_border_value = 0,
+                   float threshold = 0.0f, bool non_maxima_suppression = false, float min_distance = 1.0f);
+
+    // Inherited method overridden:
+    void run() override;
+
+private:
+    CLHOGGradient                                                 _gradient_kernel;
+    std::unique_ptr<CLHOGOrientationBinningKernel[]>              _orient_bin_kernel;
+    std::unique_ptr<CLHOGBlockNormalizationKernel[]>              _block_norm_kernel;
+    std::unique_ptr<CLHOGDetector[]>                              _hog_detect_kernel;
+    std::unique_ptr<CPPDetectionWindowNonMaximaSuppressionKernel> _non_maxima_kernel;
+    std::unique_ptr<CLTensor[]>                                   _hog_space;
+    std::unique_ptr<CLTensor[]>                                   _hog_norm_space;
+    ICLDetectionWindowArray                                      *_detection_windows;
+    CLTensor                                                      _mag;
+    CLTensor                                                      _phase;
+    bool                                                          _non_maxima_suppression;
+    size_t                                                        _num_orient_bin_kernel;
+    size_t                                                        _num_block_norm_kernel;
+    size_t                                                        _num_hog_detect_kernel;
+};
+}
+
+#endif /* __ARM_COMPUTE_CLHOGMULTIDETECTION_H__ */
\ No newline at end of file
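A usage sketch for CLHOGMultiDetection (illustrative only; the border mode, threshold and array contents are assumptions):

    #include "arm_compute/runtime/CL/CLArray.h"
    #include "arm_compute/runtime/CL/functions/CLHOGMultiDetection.h"

    using namespace arm_compute;

    void run_multi_detection(ICLTensor &src, const ICLMultiHOG &multi_hog,
                             ICLDetectionWindowArray &windows, ICLSize2DArray &strides)
    {
        // Assumption: strides holds one detection window stride per HOG model stored in multi_hog.
        CLHOGMultiDetection multi_detection;
        multi_detection.configure(&src, &multi_hog, &windows, &strides,
                                  BorderMode::CONSTANT, 0 /* border value */,
                                  0.0f /* threshold */, true /* non-maxima suppression */, 1.0f /* min distance */);
        multi_detection.run();
    }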
diff --git a/arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h b/arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h
new file mode 100644
index 0000000..b4e4691
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLLOCALLYCONNECTEDLAYER_H__
+#define __ARM_COMPUTE_CLLOCALLYCONNECTEDLAYER_H__
+
+#include "arm_compute/runtime/IFunction.h"
+
+#include "arm_compute/core/CL/kernels/CLCol2ImKernel.h"
+#include "arm_compute/core/CL/kernels/CLIm2ColKernel.h"
+#include "arm_compute/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.h"
+#include "arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to compute the locally connected layer. This function calls the following OpenCL kernels:
+ *
+ * -# @ref CLLocallyConnectedLayerWeightsReshapeKernel (executed only once for each configuration)
+ * -# @ref CLIm2ColKernel
+ * -# @ref CLLocallyConnectedMatrixMultiplyKernel
+ * -# @ref CLCol2ImKernel
+ */
+class CLLocallyConnectedLayer : public IFunction
+{
+public:
+    /** Default constructor */
+    CLLocallyConnectedLayer();
+    /** Set the input and output tensors.
+     *
+     * @param[in]  input     Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
+     *                       while every optional dimension from 4 and above represent a batch of inputs.
+     *                       Data types supported: F32.
+     * @param[in]  weights   Weights tensor. Weights are a 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches]. Data type supported: Same as @p input.
+     * @param[in]  biases    Biases tensor. Shared biases supported. Biases are a 2D tensor with dimensions [OFM, num_patches]. Data type supported: Same as @p input.
+     * @param[out] output    Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
+     *                       Data types supported: Same as @p input.
+     * @param[in]  conv_info Contains padding and stride information described in @ref PadStrideInfo.
+     */
+    void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    CLIm2ColKernel                              _input_im2col_kernel;
+    CLLocallyConnectedLayerWeightsReshapeKernel _weights_reshape_kernel;
+    CLLocallyConnectedMatrixMultiplyKernel      _mm_kernel;
+    CLCol2ImKernel                              _output_col2im_kernel;
+    CLTensor                                    _input_im2col_reshaped;
+    CLTensor                                    _weights_reshaped;
+    CLTensor                                    _gemm_output;
+    bool                                        _is_first_run;
+};
+}
+#endif /* __ARM_COMPUTE_CLLOCALLYCONNECTEDLAYER_H__ */
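A usage sketch for CLLocallyConnectedLayer (illustrative only; shapes, strides and padding are assumptions):

    #include "arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h"

    using namespace arm_compute;

    void run_locally_connected(const ICLTensor &src, const ICLTensor &weights, const ICLTensor &biases, ICLTensor &dst)
    {
        // Assumption: all tensors are F32 and already initialised/allocated with compatible shapes.
        CLLocallyConnectedLayer lc_layer;
        lc_layer.configure(&src, &weights, &biases, &dst, PadStrideInfo(1, 1, 0, 0)); // stride 1x1, no padding
        lc_layer.run();
    }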
diff --git a/arm_compute/runtime/CPP/CPPScheduler.h b/arm_compute/runtime/CPP/CPPScheduler.h
index 0828af6..7a37e5e 100644
--- a/arm_compute/runtime/CPP/CPPScheduler.h
+++ b/arm_compute/runtime/CPP/CPPScheduler.h
@@ -24,35 +24,28 @@
 #ifndef __ARM_COMPUTE_CPPSCHEDULER_H__
 #define __ARM_COMPUTE_CPPSCHEDULER_H__
 
-#include <cstddef>
+#include "arm_compute/runtime/IScheduler.h"
+
 #include <memory>
 
 namespace arm_compute
 {
-class ICPPKernel;
 class Thread;
 
-/** Pool of threads to automatically split a kernel's execution among several threads. */
-class CPPScheduler
+/** C++11 implementation of a pool of threads to automatically split a kernel's execution among several threads. */
+class CPPScheduler : public IScheduler
 {
-private:
-    /** Constructor: create a pool of threads. */
-    CPPScheduler();
-
 public:
-    /** Force the re-creation of the pool of threads to use the specified number of threads.
+    /** Sets the number of threads the scheduler will use to run the kernels.
      *
-     * @param[in] num_threads If set to 0, then std::thread::hardware_concurrency() threads will be used, otherwise the number of threads specified.
+     * @param[in] num_threads If set to 0, then the maximum number of threads supported by C++11 will be used, otherwise the number of threads specified.
      */
-    void force_number_of_threads(int num_threads);
+    void set_num_threads(unsigned int num_threads) override;
     /** Returns the number of threads that the CPPScheduler has in its pool.
      *
      * @return Number of threads available in CPPScheduler.
      */
-    int num_threads() const
-    {
-        return _num_threads;
-    }
+    unsigned int num_threads() const override;
     /** Access the scheduler singleton
      *
      * @return The scheduler
@@ -65,12 +58,15 @@
      * - The scheduler has been initialized with only one thread.
      *
      * @param[in] kernel          Kernel to execute.
-     * @param[in] split_dimension Dimension along which to split the kernel's execution window (By default 1/Y)
+     * @param[in] split_dimension Dimension along which to split the kernel's execution window.
      */
-    void multithread(ICPPKernel *kernel, size_t split_dimension = 1);
+    void schedule(ICPPKernel *kernel, unsigned int split_dimension) override;
 
 private:
-    int _num_threads;
+    /** Constructor: create a pool of threads. */
+    CPPScheduler();
+
+    unsigned int _num_threads;
     std::unique_ptr<Thread[], void (*)(Thread *)> _threads;
 };
 }
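With this change the previous CPPScheduler::force_number_of_threads() call is replaced by IScheduler::set_num_threads(). A sketch of the new usage (assuming the singleton accessor remains CPPScheduler::get(), which is not shown in this hunk):

    #include "arm_compute/runtime/CPP/CPPScheduler.h"

    using namespace arm_compute;

    void set_worker_threads()
    {
        // Passing 0 means "use the maximum number of threads supported by C++11" (see set_num_threads above)
        CPPScheduler::get().set_num_threads(4);
    }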
diff --git a/arm_compute/runtime/IFunction.h b/arm_compute/runtime/IFunction.h
index 0cd21b9..a4e7ed1 100644
--- a/arm_compute/runtime/IFunction.h
+++ b/arm_compute/runtime/IFunction.h
@@ -36,7 +36,7 @@
      * - Multi-threading is used for the kernels which are parallelisable.
      * - By default std::thread::hardware_concurrency() threads are used.
      *
-     * @note @ref CPPScheduler::force_number_of_threads() can be used to manually set the number of threads
+     * @note @ref CPPScheduler::set_num_threads() can be used to manually set the number of threads
      *
      * For OpenCL kernels:
      * - All the kernels are enqueued on the queue associated with CLScheduler.
diff --git a/arm_compute/runtime/IScheduler.h b/arm_compute/runtime/IScheduler.h
new file mode 100644
index 0000000..39c027c
--- /dev/null
+++ b/arm_compute/runtime/IScheduler.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_ISCHEDULER_H__
+#define __ARM_COMPUTE_ISCHEDULER_H__
+
+namespace arm_compute
+{
+class ICPPKernel;
+
+/** Scheduler interface to run kernels */
+class IScheduler
+{
+public:
+    /** Destructor. */
+    virtual ~IScheduler() = default;
+    /** Sets the number of threads the scheduler will use to run the kernels.
+     *
+     * @param[in] num_threads If set to 0, then one thread per CPU core available on the system will be used, otherwise the number of threads specified.
+     */
+    virtual void set_num_threads(unsigned int num_threads) = 0;
+    /** Returns the number of threads that the scheduler has in its pool.
+     *
+     * @return Number of threads available in the scheduler.
+     */
+    virtual unsigned int num_threads() const = 0;
+    /** Runs the kernel synchronously, splitting its execution window along the given dimension.
+     *
+     * @param[in] kernel          Kernel to execute.
+     * @param[in] split_dimension Dimension along which to split the kernel's execution window.
+     */
+    virtual void schedule(ICPPKernel *kernel, unsigned int split_dimension) = 0;
+};
+}
+#endif /* __ARM_COMPUTE_ISCHEDULER_H__ */
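To illustrate the IScheduler contract introduced here, a hypothetical single-threaded implementation might look as follows (illustrative only; the class name and behaviour are assumptions, not part of the patch):

    #include "arm_compute/core/CPP/ICPPKernel.h"
    #include "arm_compute/runtime/IScheduler.h"

    namespace example
    {
    // Hypothetical serial scheduler: always runs the kernel in the caller's thread.
    class SerialScheduler final : public arm_compute::IScheduler
    {
    public:
        void set_num_threads(unsigned int num_threads) override
        {
            (void)num_threads; // a serial scheduler ignores the requested thread count
        }
        unsigned int num_threads() const override
        {
            return 1;
        }
        void schedule(arm_compute::ICPPKernel *kernel, unsigned int split_dimension) override
        {
            (void)split_dimension;
            kernel->run(kernel->window()); // execute the full window without splitting
        }
    };
    } // namespace example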
diff --git a/arm_compute/runtime/MultiHOG.h b/arm_compute/runtime/MultiHOG.h
index 486ae14..32bad70 100644
--- a/arm_compute/runtime/MultiHOG.h
+++ b/arm_compute/runtime/MultiHOG.h
@@ -29,6 +29,8 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/HOG.h"
 
+#include <memory>
+
 namespace arm_compute
 {
 /** CPU implementation of multi HOG data-object */
diff --git a/arm_compute/runtime/NEON/NEFunctions.h b/arm_compute/runtime/NEON/NEFunctions.h
index ef17599..daf76f3 100644
--- a/arm_compute/runtime/NEON/NEFunctions.h
+++ b/arm_compute/runtime/NEON/NEFunctions.h
@@ -30,6 +30,7 @@
 #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
 #include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h"
 #include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h"
+#include "arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h"
 #include "arm_compute/runtime/NEON/functions/NEBitwiseAnd.h"
 #include "arm_compute/runtime/NEON/functions/NEBitwiseNot.h"
 #include "arm_compute/runtime/NEON/functions/NEBitwiseOr.h"
@@ -41,9 +42,11 @@
 #include "arm_compute/runtime/NEON/functions/NEColorConvert.h"
 #include "arm_compute/runtime/NEON/functions/NEConvolution.h"
 #include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEDepthConcatenate.h"
 #include "arm_compute/runtime/NEON/functions/NEDepthConvert.h"
 #include "arm_compute/runtime/NEON/functions/NEDerivative.h"
 #include "arm_compute/runtime/NEON/functions/NEDilate.h"
+#include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h"
 #include "arm_compute/runtime/NEON/functions/NEEqualizeHistogram.h"
 #include "arm_compute/runtime/NEON/functions/NEErode.h"
 #include "arm_compute/runtime/NEON/functions/NEFastCorners.h"
@@ -65,6 +68,7 @@
 #include "arm_compute/runtime/NEON/functions/NEIntegralImage.h"
 #include "arm_compute/runtime/NEON/functions/NELaplacianPyramid.h"
 #include "arm_compute/runtime/NEON/functions/NELaplacianReconstruct.h"
+#include "arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h"
 #include "arm_compute/runtime/NEON/functions/NEMagnitude.h"
 #include "arm_compute/runtime/NEON/functions/NEMeanStdDev.h"
 #include "arm_compute/runtime/NEON/functions/NEMedian3x3.h"
diff --git a/arm_compute/runtime/NEON/NEScheduler.h b/arm_compute/runtime/NEON/NEScheduler.h
index c65d6b7..94c82b2 100644
--- a/arm_compute/runtime/NEON/NEScheduler.h
+++ b/arm_compute/runtime/NEON/NEScheduler.h
@@ -24,10 +24,10 @@
 #ifndef __ARM_COMPUTE_NESCHEDULER_H__
 #define __ARM_COMPUTE_NESCHEDULER_H__
 
-#include "arm_compute/runtime/CPP/CPPScheduler.h"
+#include "arm_compute/runtime/Scheduler.h"
 
 namespace arm_compute
 {
-using NEScheduler = CPPScheduler;
+using NEScheduler = Scheduler;
 }
 #endif /*__ARM_COMPUTE_NESCHEDULER_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEActivationLayer.h b/arm_compute/runtime/NEON/functions/NEActivationLayer.h
index 3fb3e20..35366e1 100644
--- a/arm_compute/runtime/NEON/functions/NEActivationLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEActivationLayer.h
@@ -41,7 +41,7 @@
 public:
     /** Set the input and output tensor.
      *
-     * @param[in]  input           Source tensor. Data type supported: F32.
+     * @param[in]  input           Source tensor. Data type supported: QS8/F32.
      * @param[out] output          Destination tensor. Data type supported: same as @p input
      * @param[in]  activation_info Activation layer parameters.
      */
diff --git a/arm_compute/runtime/NEON/functions/NEArithmeticAddition.h b/arm_compute/runtime/NEON/functions/NEArithmeticAddition.h
index 8f66a6d..8e34e98 100644
--- a/arm_compute/runtime/NEON/functions/NEArithmeticAddition.h
+++ b/arm_compute/runtime/NEON/functions/NEArithmeticAddition.h
@@ -35,11 +35,11 @@
 class NEArithmeticAddition : public INESimpleFunction
 {
 public:
-    /** Initialise the kernel's inputs, output and convertion policy.
+    /** Initialise the kernel's inputs, output and conversion policy.
      *
-     * @param[in]  input1 First tensor input. Data types supported: U8 or S16.
-     * @param[in]  input2 Second tensor input. Data types supported: U8 or S16.
-     * @param[out] output Output tensor. Data types supported: U8 or S16.
+     * @param[in]  input1 First tensor input. Data types supported: U8/S16.
+     * @param[in]  input2 Second tensor input. Data types supported: U8/S16.
+     * @param[out] output Output tensor. Data types supported: U8/S16.
      * @param[in]  policy Policy to use to handle overflow.
      */
     void configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy);
diff --git a/arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h b/arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h
index d0eaff7..841b591 100644
--- a/arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h
+++ b/arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h
@@ -35,11 +35,11 @@
 class NEArithmeticSubtraction : public INESimpleFunction
 {
 public:
-    /** Initialise the kernel's inputs, output and convertion policy.
+    /** Initialise the kernel's inputs, output and conversion policy.
      *
-     * @param[in]  input1 First tensor input. Data types supported: U8 or S16.
-     * @param[in]  input2 Second tensor input. Data types supported: U8 or S16.
-     * @param[out] output Output tensor. Data types supported: U8 or S16.
+     * @param[in]  input1 First tensor input. Data types supported: U8/S16.
+     * @param[in]  input2 Second tensor input. Data types supported: U8/S16.
+     * @param[out] output Output tensor. Data types supported: U8/S16.
      * @param[in]  policy Policy to use to handle overflow.
      */
     void configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy);
diff --git a/arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h b/arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h
new file mode 100644
index 0000000..b0b5c12
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEBATCHNORMALIZATIONLAYER_H__
+#define __ARM_COMPUTE_NEBATCHNORMALIZATIONLAYER_H__
+
+#include "arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to run @ref NENormalizationLayerKernel and simulate a batch normalization layer.
+ *
+ * Batch normalization is calculated by:
+ * @f[ out_i = \gamma * (\frac{in_i - \mu_{B}}{\sqrt{\sigma^2_{B} + \epsilon}}) + \beta \equiv BN_{\gamma,\beta}(in_i) @f]
+ *
+ */
+class NEBatchNormalizationLayer : public IFunction
+{
+public:
+    /** Default constructor */
+    NEBatchNormalizationLayer();
+    /** Set the input and output tensors.
+     *
+     * @param[in]  input   Source tensor. 3 lower dimensions represent a single input with dimensions [width, height, FM].
+     *                     The rest are optional and used for representing batches. Data types supported: QS8/F32.
+     * @param[out] output  Destination tensor. Output will have the same number of dimensions as input. Data type supported: same as @p input
+     * @param[in]  mean    Mean values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+     * @param[in]  var     Variance values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+     * @param[in]  beta    Beta values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+     * @param[in]  gamma   Gamma values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+     * @param[in]  epsilon Small value to avoid division by zero.
+     */
+    void configure(const ITensor *input, ITensor *output, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    NEBatchNormalizationLayerKernel _norm_kernel; /**< Batch normalization layer kernel */
+};
+}
+#endif /* __ARM_COMPUTE_NEBATCHNORMALIZATIONLAYER_H__ */
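A usage sketch for the new NEBatchNormalizationLayer (illustrative only; tensor shapes and the epsilon value are assumptions):

    #include "arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void run_batch_norm(const Tensor &src, Tensor &dst, const Tensor &mean, const Tensor &var,
                        const Tensor &beta, const Tensor &gamma)
    {
        // Assumption: src/dst are F32 [width, height, FM, batches] tensors and the per-FM
        // statistics tensors are 1D of size FM, all initialised and allocated beforehand.
        NEBatchNormalizationLayer bn_layer;
        bn_layer.configure(&src, &dst, &mean, &var, &beta, &gamma, 0.001f /* epsilon */);
        bn_layer.run();
    }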
diff --git a/arm_compute/runtime/NEON/functions/NEConvolution.h b/arm_compute/runtime/NEON/functions/NEConvolution.h
index 5c80977..1704d9f 100644
--- a/arm_compute/runtime/NEON/functions/NEConvolution.h
+++ b/arm_compute/runtime/NEON/functions/NEConvolution.h
@@ -49,7 +49,7 @@
     /** Initialize the function's source, destination, conv and border_mode.
      *
      * @param[in,out] input                 Source tensor. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED)
-     * @param[out]    output                Destination tensor, Data types supported: U8 or S16.
+     * @param[out]    output                Destination tensor, Data types supported: U8/S16.
      * @param[in]     conv                  Matrix_size x matrix_size S16 coefficients structured as a row-major 2D array in a linear buffer.
      * @param[in]     scale                 Scale of the convolution matrix. If 0 is passed, it will be set to the sum of the coefficients of the convolution or 1 if they add up to 0.
      * @param[in]     border_mode           Strategy to use for borders.
diff --git a/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h
index a6862ca..a8fff8d 100644
--- a/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h
@@ -27,12 +27,12 @@
 #include "arm_compute/runtime/IFunction.h"
 
 #include "arm_compute/core/NEON/kernels/NECol2ImKernel.h"
-#include "arm_compute/core/NEON/kernels/NEConvolutionLayerWeightsReshapeKernel.h"
 #include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
 #include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
 #include "arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h"
 #include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
 #include "arm_compute/core/NEON/kernels/NEIm2ColKernel.h"
+#include "arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/Tensor.h"
 
@@ -40,11 +40,38 @@
 {
 class ITensor;
 
-/** Basic function to simulate a convolution layer. This function calls the following OpenCL kernels:
- * -# @ref NEConvolutionLayerWeightsReshapeKernel (executed only once for each configuration)
- * -# @ref NEGEMMTranspose1xWKernel               (executed only once for each configuration)
+/** Function to reshape and perform 1xW transposition on the weights. This function calls the following kernels:
+ * -# @ref NEWeightsReshapeKernel
+ * -# @ref NEGEMMTranspose1xWKernel (executed in case GEMM is required for the operation)
+ */
+class NEConvolutionLayerReshapeWeights : public IFunction
+{
+public:
+    /** Constructor */
+    NEConvolutionLayerReshapeWeights();
+    /** Set the input and output tensors.
+     *
+     * @param[in]  weights      Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: QS8/F32.
+     * @param[in]  biases       Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p weights.
+     * @param[out] output       Destination tensor. Data types supported: Same as @p weights.
+     * @param[in]  transpose1xW True if the weights are to undergo a 1xW transposition after reshaping (in case of GEMM operation), false otherwise.
+     */
+    void configure(const ITensor *weights, const ITensor *biases, ITensor *output, bool transpose1xW);
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    NEWeightsReshapeKernel   _weights_reshape_kernel;
+    NEGEMMTranspose1xWKernel _weights_transposed_kernel;
+    Tensor                   _weights_reshaped;
+    bool                     _transpose1xW;
+};
+
+/** Basic function to simulate a convolution layer. This function calls the following NEON kernels:
+ * -# @ref NEWeightsReshapeKernel   (executed only once for each configuration)
  * -# @ref NEIm2ColKernel
- * -# @ref NEGEMMInterleave4x4Kernel
+ * -# @ref NEGEMMInterleave4x4Kernel (executed only in case GEMM is required for the operation)
  * -# @ref NEGEMMMatrixMultiplyKernel
  * -# @ref NECol2ImKernel
  */
@@ -55,34 +82,34 @@
     NEConvolutionLayer();
     /** Set the input and output tensors.
      *
-     * @param[in]  input     Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
-     *                       while every optional dimension from 4 and above represent a batch of inputs.
-     *                       Data types supported: F32.
-     * @param[in]  weights   Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: Same as @p input.
-     * @param[in]  biases    Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[out] output    Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
-     *                       Data types supported: Same as @p input.
-     * @param[in]  conv_info Contains padding and stride information described in @ref PadStrideInfo.
+     * @param[in]  input        Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
+     *                          while every optional dimension from 4 and above represent a batch of inputs.
+     *                          Data types supported: QS8/F32.
+     * @param[in]  weights      Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: Same as @p input.
+     * @param[in]  biases       Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
+     * @param[out] output       Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
+     *                          Data types supported: Same as @p input.
+     * @param[in]  conv_info    Contains padding and stride information described in @ref PadStrideInfo.
+     * @param[in]  weights_info Specifies if the weights tensor has been reshaped with NEWeightsReshapeKernel and, if it is not part of a fully connected layer,
+     *                          also transposed with NEGEMMTranspose1xWKernel. Data type supported: Same as @p input.
      */
-    void configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info);
-
+    void configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info = WeightsInfo());
     // Inherited methods overridden:
     void run() override;
 
 private:
-    NEIm2ColKernel                         _input_im2col_kernel;
-    NEGEMMInterleave4x4Kernel              _input_interleave_kernel;
-    NEConvolutionLayerWeightsReshapeKernel _weights_reshape_kernel;
-    NEGEMMTranspose1xWKernel               _weights_transposed_kernel;
-    NEGEMMMatrixMultiplyKernel             _mm_kernel;
-    NECol2ImKernel                         _output_col2im_kernel;
-    Tensor                                 _input_im2col_reshaped;
-    Tensor                                 _input_interleaved_reshaped;
-    Tensor                                 _weights_reshaped;
-    Tensor                                 _weights_transposed;
-    Tensor                                 _gemm_output;
-    bool                                   _is_first_run;
-    bool                                   _has_bias;
+    NEIm2ColKernel                   _input_im2col_kernel;
+    NEGEMMInterleave4x4Kernel        _input_interleave_kernel;
+    NEConvolutionLayerReshapeWeights _reshape_weights;
+    NEGEMMMatrixMultiplyKernel       _mm_kernel;
+    NECol2ImKernel                   _output_col2im_kernel;
+    Tensor                           _input_im2col_reshaped;
+    Tensor                           _input_interleaved_reshaped;
+    Tensor                           _weights_reshaped;
+    Tensor                           _gemm_output;
+    bool                             _has_bias;
+    bool                             _is_fully_connected_convolution;
+    bool                             _are_weights_reshaped;
 };
 }
 #endif /* __ARM_COMPUTE_NECONVOLUTIONLAYER_H__ */
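A usage sketch of the reworked NEConvolutionLayer interface with the new weights_info parameter (illustrative only; shapes, strides and padding are assumptions):

    #include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void run_convolution(const Tensor &src, const Tensor &weights, const Tensor &biases, Tensor &dst)
    {
        // Assumption: F32 tensors, already initialised and allocated with compatible shapes.
        NEConvolutionLayer conv;
        // The default WeightsInfo() indicates the weights have not been reshaped beforehand,
        // so the function will run NEConvolutionLayerReshapeWeights on the first configuration.
        conv.configure(&src, &weights, &biases, &dst, PadStrideInfo(1, 1, 1, 1), WeightsInfo());
        conv.run();
    }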
diff --git a/arm_compute/runtime/NEON/functions/NEDepthConcatenate.h b/arm_compute/runtime/NEON/functions/NEDepthConcatenate.h
new file mode 100644
index 0000000..02ff122
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEDepthConcatenate.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEDEPTHCONCATENATE_H__
+#define __ARM_COMPUTE_NEDEPTHCONCATENATE_H__
+
+#include "arm_compute/runtime/IFunction.h"
+
+#include <memory>
+#include <vector>
+
+namespace arm_compute
+{
+class ITensor;
+class NEDepthConcatenateKernel;
+class NEFillBorderKernel;
+
+/** Basic function to execute concatenate tensors along z axis. This function calls the following kernels:
+ *
+ * -# @ref NEFillBorderKernel (executed if input's lowest two dimensions are smaller than respective output's dimensions)
+ * -# @ref NEDepthConcatenateKernel
+ *
+ */
+class NEDepthConcatenate : public IFunction
+{
+public:
+    /** Default constructor */
+    NEDepthConcatenate();
+    /** Initialise the kernel's inputs vector and output.
+     *
+     * @param[in,out] inputs_vector The vector containing all the tensors to concatenate. Data types supported: F32.
+     * @param[out]    output        Output tensor. Data types supported: F32.
+     */
+    void configure(std::vector<ITensor *> inputs_vector, ITensor *output);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    std::vector<ITensor *>                      _inputs_vector;
+    std::unique_ptr<NEDepthConcatenateKernel[]> _concat_kernels_vector;
+    std::unique_ptr<NEFillBorderKernel[]>       _border_handlers_vector;
+    unsigned int                                _num_inputs;
+};
+}
+#endif /* __ARM_COMPUTE_NEDEPTHCONCATENATE_H__ */
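A usage sketch for NEDepthConcatenate (illustrative only; tensor shapes are assumptions):

    #include "arm_compute/runtime/NEON/functions/NEDepthConcatenate.h"
    #include "arm_compute/runtime/Tensor.h"

    #include <vector>

    using namespace arm_compute;

    void run_depth_concatenate(Tensor &t0, Tensor &t1, Tensor &dst)
    {
        // Assumption: t0 and t1 are F32 tensors whose depths add up to the depth of dst.
        std::vector<ITensor *> inputs = { &t0, &t1 };
        NEDepthConcatenate concat;
        concat.configure(inputs, &dst);
        concat.run();
    }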
diff --git a/arm_compute/runtime/NEON/functions/NEDepthConvert.h b/arm_compute/runtime/NEON/functions/NEDepthConvert.h
index 21ccca3..7c59ce4 100644
--- a/arm_compute/runtime/NEON/functions/NEDepthConvert.h
+++ b/arm_compute/runtime/NEON/functions/NEDepthConvert.h
@@ -48,17 +48,18 @@
      * Input format must be different than output format.
      *
      * Valid conversions Input -> Output :
-     *    U8 -> U16, S16, U32, S32
-     *    U16 -> U8, U32, S32
-     *    S16 -> U8, U32, S32
-     *    U32 -> U8, U16, S16
-     *    S32 -> U8, U16, S16
+     *    QS8 -> F32
+     *    U8 -> U16, S16, S32
+     *    U16 -> U8, U32
+     *    S16 -> U8, S32
+     *    F32 -> QS8
      *
      *
-     * @param[in]  input  The input tensor to convert. Data type supported: U8, U16, S16, U32 or S32.
-     * @param[out] output The output tensor. Data type supported: U8, U16, S16, U32 or S32.
+     * @param[in]  input  The input tensor to convert. Data type supported: QS8/U8/U16/S16/F32.
+     * @param[out] output The output tensor. Data type supported: QS8/U8/U16/S16/U32/S32/F32.
      * @param[in]  policy Conversion policy.
      * @param[in]  shift  Value for down/up conversions. Must be 0 <= shift < 8.
+     *                    It is not used on fixed point conversion.
      */
     void configure(const ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift);
 };
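A usage sketch for one of the conversions listed above, U8 -> S16 (illustrative only; the conversion policy choice and tensor setup are assumptions):

    #include "arm_compute/runtime/NEON/functions/NEDepthConvert.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void convert_u8_to_s16(const Tensor &src, Tensor &dst)
    {
        // Assumption: src is U8 and dst is S16, both initialised and allocated beforehand.
        NEDepthConvert convert;
        convert.configure(&src, &dst, ConvertPolicy::SATURATE, 0 /* shift */);
        convert.run();
    }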
diff --git a/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h
new file mode 100644
index 0000000..a356cac
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYER_H__
+#define __ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYER_H__
+
+#include "arm_compute/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.h"
+#include "arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h"
+#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/Tensor.h"
+
+namespace arm_compute
+{
+/** Function to run the direct convolution.
+ *
+ *  This function calls the following NEON kernels:
+ *
+ * -# @ref NEFillBorderKernel for the input
+ * -# @ref NEDirectConvolutionLayerBiasAccumulateKernel
+ * -# @ref NEDirectConvolutionLayerKernel
+ */
+class NEDirectConvolutionLayer : public IFunction
+{
+public:
+    /** Constructor */
+    NEDirectConvolutionLayer();
+    /** Set the input, weights, biases and output tensors.
+      *
+      * @param[in, out] input     Input tensor. Data types supported: QS8/F32.
+      * @param[in]      weights   Set of kernels to convolve the input volume.
+      *                           The 3rd dimension must be the same as the input's volume 3rd dimension.
+      *                           Data type supported: Same as @p input.
+      * @param[in]      bias      Set of biases. Data type supported: Same as @p input.
+      * @param[out]     output    Output tensor.
+      *                           The 3rd dimension must be equal to the 4th dimension of the @p weights tensor. Data types supported: Same as @p input.
+      * @param[in]      conv_info Contains padding and stride information described in @ref PadStrideInfo.
+      */
+    void configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &conv_info);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    NEDirectConvolutionLayerBiasAccumulateKernel _accumulate_bias_kernel;
+    NEDirectConvolutionLayerKernel               _conv_kernel;
+    NEFillBorderKernel                           _input_border_handler;
+    Tensor                                       _accumulator;
+};
+}
+#endif /* __ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYER_H__ */
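A usage sketch for the new NEDirectConvolutionLayer (illustrative only; shapes, strides and padding are assumptions):

    #include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void run_direct_convolution(Tensor &src, const Tensor &weights, const Tensor &bias, Tensor &dst)
    {
        // Assumption: F32 tensors with compatible shapes; src may be written to for border handling.
        NEDirectConvolutionLayer direct_conv;
        direct_conv.configure(&src, &weights, &bias, &dst, PadStrideInfo(1, 1, 0, 0));
        direct_conv.run();
    }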
diff --git a/arm_compute/runtime/NEON/functions/NEFillBorder.h b/arm_compute/runtime/NEON/functions/NEFillBorder.h
index c69c285..b6b7e77 100644
--- a/arm_compute/runtime/NEON/functions/NEFillBorder.h
+++ b/arm_compute/runtime/NEON/functions/NEFillBorder.h
@@ -41,7 +41,7 @@
      *
      * @note This function fills the borders within the XY-planes.
      *
-     * @param[in, out] input                 Source tensor. Data type supported: U8, S16, S32, F32
+     * @param[in, out] input                 Source tensor. Data type supported: U8/QS8/S16/S32/F32
      * @param[in]      border_width          Width of the tensor border in pixels.
      * @param[in]      border_mode           Strategy to use for borders.
      * @param[in]      constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
diff --git a/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h b/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h
index 69e27b8..33ec4ef 100644
--- a/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h
@@ -36,10 +36,41 @@
 
 namespace arm_compute
 {
+/** Basic function to reshape the weights of Fully Connected layer with NEON. This function calls the following kernels:
+ *
+ *  -# @ref NETransposeKernel        (if @p transpose_weights is set to true)
+ *  -# @ref NEGEMMTranspose1xWKernel (if @p is_batched_fc_layer is set to true)
+ *
+ * @note  The fully connected layer accepts "weights" tensors only with 2 dimensions.
+ */
+class NEFullyConnectedLayerReshapeWeights : public IFunction
+{
+public:
+    /** Constructor */
+    NEFullyConnectedLayerReshapeWeights();
+    /** Set the input and output tensors.
+     *
+     * @param[in]  input               Weights tensor. The weights must be 2 dimensional. Data types supported: QS8/F32.
+     * @param[out] output              Destination tensor. Data type supported: Same as @p input.
+     * @param[in]  transpose_weights   True if the weights must be transposed.
+     * @param[in]  is_batched_fc_layer True if it is a batched fully connected layer.
+     */
+    void configure(const ITensor *input, ITensor *output, bool transpose_weights, bool is_batched_fc_layer);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    NETransposeKernel        _transpose_kernel;
+    NEGEMMTranspose1xWKernel _transpose1xW_kernel;
+    Tensor                   _transpose_output;
+    bool                     _transpose_weights;
+    bool                     _is_batched_fc_layer;
+};
+
 /** Basic function to compute a Fully Connected layer on NEON. This function calls the following NEON kernels:
- *  -# @ref NEIm2ColKernel (called when the input comes from a convolutional layer)
- *  -# @ref NETransposeKernel (if @p transpose_weights flag is set to true) (called once)
- *  -# @ref NEGEMMTranspose1xWKernel (called once if we have a multi-batch input)
+ *  -# @ref NEIm2ColKernel                      (called when the input comes from a convolutional layer)
+ *  -# @ref NEFullyConnectedLayerReshapeWeights (if @p are_weights_reshaped flag is set to false) (called once)
  *  -# @ref NEGEMMInterleave4x4Kernel (called if we have a multi-batch input)
  *  -# @ref NEGEMMMatrixMultiplyKernel
  *  -# @ref NEGEMMMatrixAccumulateBiasesKernel (if @p biases is not equal to nullptr)
@@ -53,13 +84,14 @@
     NEFullyConnectedLayer();
     /** Set the input and output tensors.
      *
-     * @param[in]  input             Source tensor. Data type supported: F32.
-     * @param[in]  weights           Weights tensor. The weights must be 2 dimensional. Data type supported: Same as @p input.
-     * @param[in]  biases            Bias tensor. Can be nullptr. Data type supported:Same as @p input.
-     * @param[out] output            Destination tensor. Data type supported: Same as @p input.
-     * @param[in]  transpose_weights (Optional) Transpose weights if true. Defaults to true.
+     * @param[in]  input                Source tensor. Data type supported: QS8/F32.
+     * @param[in]  weights              Weights tensor. The weights must be 2 dimensional. Data type supported: Same as @p input.
+     * @param[in]  biases               Bias tensor. Can be nullptr. Data type supported: Same as @p input.
+     * @param[out] output               Destination tensor. Data type supported: Same as @p input.
+     * @param[in]  transpose_weights    (Optional) Transpose the weights tensor if true. Defaults to true.
+     * @param[in]  are_weights_reshaped (Optional) True if the weights tensor has already been reshaped; if false the function reshapes it. Defaults to false.
      */
-    void configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, bool transpose_weights = true);
+    void configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, bool transpose_weights = true, bool are_weights_reshaped = false);
 
     //Inherited methods override
     void run() override;
@@ -70,21 +102,18 @@
     void configure_conv_fc_wb(const ITensor *input, const ITensor *weights, ITensor *output);
     void configure_conv_fc_nb(const ITensor *input, const ITensor *weights, ITensor *output);
 
-    NEIm2ColKernel                     _im2col_kernel;
-    NETransposeKernel                  _transpose_kernel;
-    NEGEMMTranspose1xWKernel           _transpose1xW_kernel;
-    NEGEMMInterleave4x4Kernel          _interleave4x4_kernel;
-    NEGEMMMatrixMultiplyKernel         _mm_kernel;
-    NEGEMMMatrixAccumulateBiasesKernel _accumulate_biases_kernel;
-    Tensor                             _im2col_output;
-    Tensor                             _interleave4x4_output;
-    Tensor                             _transpose_output;
-    Tensor                             _transpose1xW_output;
-    bool                               _is_first_run;
-    bool                               _transpose_weights;
-    bool                               _fc_after_conv;
-    bool                               _batched_fc_layer;
-    bool                               _accumulate_biases;
+    NEIm2ColKernel                      _im2col_kernel;
+    NEFullyConnectedLayerReshapeWeights _reshape_weights_kernel;
+    NEGEMMInterleave4x4Kernel           _interleave4x4_kernel;
+    NEGEMMMatrixMultiplyKernel          _mm_kernel;
+    NEGEMMMatrixAccumulateBiasesKernel  _accumulate_biases_kernel;
+    Tensor                              _im2col_output;
+    Tensor                              _interleave4x4_output;
+    Tensor                              _reshape_weights_output;
+    bool                                _are_weights_reshaped;
+    bool                                _is_fc_after_conv;
+    bool                                _is_batched_fc_layer;
+    bool                                _accumulate_biases;
 };
 }
 #endif /* __ARM_COMPUTE_NEFULLYCONNECTEDLAYER_H__ */
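A usage sketch of the reworked NEFullyConnectedLayer interface with the new are_weights_reshaped flag (illustrative only; tensor setup is an assumption):

    #include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void run_fully_connected(const Tensor &src, const Tensor &weights, const Tensor &biases, Tensor &dst)
    {
        // Assumption: F32 tensors; the 2D weights have not been reshaped beforehand.
        NEFullyConnectedLayer fc_layer;
        fc_layer.configure(&src, &weights, &biases, &dst,
                           true  /* transpose_weights */,
                           false /* are_weights_reshaped */);
        fc_layer.run();
    }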
diff --git a/arm_compute/runtime/NEON/functions/NEGEMM.h b/arm_compute/runtime/NEON/functions/NEGEMM.h
index b9346e7..a40aa91 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMM.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMM.h
@@ -50,9 +50,9 @@
     /** Initialise the kernel's inputs, output
      *
      * @note GEMM: General Matrix Multiply - [alpha * A * B + beta * C].
-     * @note GEMM: The tensors a, b, c, d must have the same data type. All are either F32 or F16. You should not mix data types when calling this function.
+     * @note GEMM: The tensors a, b, c, d must have the same data type. You should not mix data types when calling this function.
      *
-     * @param[in]  a     First input tensor  (Matrix A or Vector A). Data type supported: F32, F16.
+     * @param[in]  a     First input tensor  (Matrix A or Vector A). Data type supported: QS8/F16/F32
      * @param[in]  b     Second input tensor (Matrix B). Data type supported: same as @p a
      * @param[in]  c     Third input tensor  (Matrix C). It can be a nullptr if just the multiplication between @p a and @p b is needed. Data type supported: same as @p a
      * @param[out] d     Output tensor. Data type supported: same as @p a
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMInterleave4x4.h b/arm_compute/runtime/NEON/functions/NEGEMMInterleave4x4.h
index 71fefbf..b911fd0 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMInterleave4x4.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMInterleave4x4.h
@@ -40,7 +40,7 @@
 public:
     /** Initialise the kernel's inputs, output
      *
-     * @param[in]  input  First input tensor. Data types supported: U8/S8/U16/S16/F16/U32/S32/F32
+     * @param[in]  input  First input tensor. Data types supported: U8/S8/QS8/U16/S16/F16/U32/S32/F32
      * @param[out] output Output tensor. Data type supported: same as @p input
      */
     void configure(const ITensor *input, ITensor *output);
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMTranspose1xW.h b/arm_compute/runtime/NEON/functions/NEGEMMTranspose1xW.h
index 69096fb..447b8c9 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMTranspose1xW.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMTranspose1xW.h
@@ -38,7 +38,7 @@
 public:
     /** Initialise the kernel's inputs, output
      *
-     * @param[in]  input  First input tensor. Data type supported: F32, F16, U8.
+     * @param[in]  input  First input tensor. Data types supported: U8/S8/QS8/U16/S16/F16/U32/S32/F32
      * @param[out] output Output tensor. Data type supported: same as @p input
      */
     void configure(const ITensor *input, ITensor *output);
diff --git a/arm_compute/runtime/NEON/functions/NEGaussian5x5.h b/arm_compute/runtime/NEON/functions/NEGaussian5x5.h
index 7487f66..699e42e 100644
--- a/arm_compute/runtime/NEON/functions/NEGaussian5x5.h
+++ b/arm_compute/runtime/NEON/functions/NEGaussian5x5.h
@@ -64,7 +64,7 @@
 protected:
     NEGaussian5x5HorKernel  _kernel_hor;     /**< kernel for horizontal pass */
     NEGaussian5x5VertKernel _kernel_vert;    /**< kernel for vertical pass */
-    Tensor                  _tmp;            /** temporary buffer for output of horizontal pass */
+    Tensor                  _tmp;            /**< temporary buffer for output of horizontal pass */
     NEFillBorderKernel      _border_handler; /**< kernel to handle tensor borders */
 };
 }
diff --git a/arm_compute/runtime/NEON/functions/NEHOGDetector.h b/arm_compute/runtime/NEON/functions/NEHOGDetector.h
index 46ab72c..98b8a89 100644
--- a/arm_compute/runtime/NEON/functions/NEHOGDetector.h
+++ b/arm_compute/runtime/NEON/functions/NEHOGDetector.h
@@ -40,6 +40,8 @@
 public:
     /** Initialise the kernel's input, output, HOG data object, detection window stride, threshold and index class
      *
+     * @attention The function does not reset the number of values in @ref IDetectionWindowArray, so it is the caller's responsibility to clear it.
+     *
      * @param[in]  input                   Input tensor. It is the output of @ref NEHOGDescriptor. Data type supported: F32
      * @param[in]  hog                     HOG data-object that describes the HOG descriptor
      * @param[out] detection_windows       Array of @ref DetectionWindow used to store the detected objects
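Given the @attention note above, a caller would typically clear the window array before every detection pass. A hedged sketch, assuming the DetectionWindowArray alias and the clear()/num_values() members of the IArray interface:

    #include "arm_compute/runtime/Array.h"

    using namespace arm_compute;

    int main()
    {
        // Sketch only: a capacity of 1000 windows is an arbitrary choice.
        DetectionWindowArray detection_windows(1000);

        // ... configure an NEHOGDetector with &detection_windows as its output array ...

        detection_windows.clear(); // caller's responsibility before each run(), per the note above
        // detector.run();
        // detection_windows.num_values() now reports how many objects were detected.
        return 0;
    }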
diff --git a/arm_compute/runtime/NEON/functions/NEHOGMultiDetection.h b/arm_compute/runtime/NEON/functions/NEHOGMultiDetection.h
index 9440ee0..2d07e64 100644
--- a/arm_compute/runtime/NEON/functions/NEHOGMultiDetection.h
+++ b/arm_compute/runtime/NEON/functions/NEHOGMultiDetection.h
@@ -24,10 +24,10 @@
 #ifndef __ARM_COMPUTE_NEHOGMULTIDETECTION_H__
 #define __ARM_COMPUTE_NEHOGMULTIDETECTION_H__
 
+#include "arm_compute/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.h"
 #include "arm_compute/core/IArray.h"
 #include "arm_compute/core/IMultiHOG.h"
 #include "arm_compute/core/NEON/kernels/NEHOGDescriptorKernel.h"
-#include "arm_compute/core/NEON/kernels/NEHOGNonMaximaSuppressionKernel.h"
 #include "arm_compute/runtime/IFunction.h"
 #include "arm_compute/runtime/NEON/functions/NEHOGDetector.h"
 #include "arm_compute/runtime/NEON/functions/NEHOGGradient.h"
@@ -41,7 +41,7 @@
  * -# @ref NEHOGOrientationBinningKernel
  * -# @ref NEHOGBlockNormalizationKernel
  * -# @ref NEHOGDetector
- * -# @ref NEHOGNonMaximaSuppressionKernel (executed if non_maxima_suppression == true)
+ * -# @ref CPPDetectionWindowNonMaximaSuppressionKernel (executed if non_maxima_suppression == true)
  *
  * @note This implementation works if all the HOG data-objects within the IMultiHOG container have the same:
  *       -# Phase type
@@ -85,20 +85,20 @@
     void run() override;
 
 private:
-    NEHOGGradient                                    _gradient_kernel;
-    std::unique_ptr<NEHOGOrientationBinningKernel[]> _orient_bin_kernel;
-    std::unique_ptr<NEHOGBlockNormalizationKernel[]> _block_norm_kernel;
-    std::unique_ptr<NEHOGDetector[]>                 _hog_detect_kernel;
-    std::unique_ptr<NEHOGNonMaximaSuppressionKernel> _non_maxima_kernel;
-    std::unique_ptr<Tensor[]>                        _hog_space;
-    std::unique_ptr<Tensor[]>                        _hog_norm_space;
-    IDetectionWindowArray                           *_detection_windows;
-    Tensor                                           _mag;
-    Tensor                                           _phase;
-    bool                                             _non_maxima_suppression;
-    size_t                                           _num_orient_bin_kernel;
-    size_t                                           _num_block_norm_kernel;
-    size_t                                           _num_hog_detect_kernel;
+    NEHOGGradient                                                 _gradient_kernel;
+    std::unique_ptr<NEHOGOrientationBinningKernel[]>              _orient_bin_kernel;
+    std::unique_ptr<NEHOGBlockNormalizationKernel[]>              _block_norm_kernel;
+    std::unique_ptr<NEHOGDetector[]>                              _hog_detect_kernel;
+    std::unique_ptr<CPPDetectionWindowNonMaximaSuppressionKernel> _non_maxima_kernel;
+    std::unique_ptr<Tensor[]>                                     _hog_space;
+    std::unique_ptr<Tensor[]>                                     _hog_norm_space;
+    IDetectionWindowArray                                        *_detection_windows;
+    Tensor                                                        _mag;
+    Tensor                                                        _phase;
+    bool                                                          _non_maxima_suppression;
+    size_t                                                        _num_orient_bin_kernel;
+    size_t                                                        _num_block_norm_kernel;
+    size_t                                                        _num_hog_detect_kernel;
 };
 }
 
diff --git a/arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h b/arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h
new file mode 100644
index 0000000..1b2b2ee
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NELOCALLYCONNECTEDLAYER_H__
+#define __ARM_COMPUTE_NELOCALLYCONNECTEDLAYER_H__
+
+#include "arm_compute/runtime/IFunction.h"
+
+#include "arm_compute/core/NEON/kernels/NECol2ImKernel.h"
+#include "arm_compute/core/NEON/kernels/NEIm2ColKernel.h"
+#include "arm_compute/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.h"
+#include "arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/Tensor.h"
+
+namespace arm_compute
+{
+class INETensor;
+
+/** Basic function to compute the locally connected layer. This function calls the following NEON kernels:
+ *
+ * -# @ref NEWeightsReshapeKernel (executed only once for each configuration)
+ * -# @ref NEIm2ColKernel
+ * -# @ref NELocallyConnectedMatrixMultiplyKernel
+ * -# @ref NECol2ImKernel
+ */
+class NELocallyConnectedLayer : public IFunction
+{
+public:
+    /** Default constructor */
+    NELocallyConnectedLayer();
+    /** Set the input and output tensors.
+     *
+     * @param[in]  input     Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
+     *                       while every optional dimension from 4 and above represent a batch of inputs.
+     *                       Data types supported: F32.
+     * @param[in]  weights   Weights tensor. Weights are a 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches]. Data type supported: Same as @p input.
+     * @param[in]  biases    Biases tensor. Shared biases supported. Biases are a 2D tensor with dimensions [OFM, num_patches]. Data type supported: Same as @p input.
+     * @param[out] output    Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
+     *                       Data types supported: Same as @p input.
+     * @param[in]  conv_info Contains padding and stride information described in @ref PadStrideInfo.
+     */
+    void configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    NEIm2ColKernel                         _input_im2col_kernel;
+    NEWeightsReshapeKernel                 _weights_reshape_kernel;
+    NELocallyConnectedMatrixMultiplyKernel _mm_kernel;
+    NECol2ImKernel                         _output_col2im_kernel;
+    Tensor                                 _input_im2col_reshaped;
+    Tensor                                 _weights_reshaped;
+    Tensor                                 _gemm_output;
+    bool                                   _is_first_run;
+};
+}
+#endif /* __ARM_COMPUTE_NELOCALLYCONNECTEDLAYER_H__ */
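A hedged usage sketch of the configure() call declared above. The shapes follow the documented layouts ([width, height, IFM] input, [kernel_x, kernel_y, IFM, OFM, num_patches] weights, [OFM, num_patches] biases); the concrete sizes and the allocator initialisation pattern are assumptions for illustration:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    int main()
    {
        // Sketch only: 8x8x2 input, 3x3 kernels, stride 1, no padding -> 6x6x4 output (36 patches).
        Tensor src, weights, biases, dst;
        src.allocator()->init(TensorInfo(TensorShape(8U, 8U, 2U), 1, DataType::F32));
        weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 2U, 4U, 36U), 1, DataType::F32));
        biases.allocator()->init(TensorInfo(TensorShape(4U, 36U), 1, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(6U, 6U, 4U), 1, DataType::F32));

        NELocallyConnectedLayer lc_layer;
        lc_layer.configure(&src, &weights, &biases, &dst, PadStrideInfo(1, 1, 0, 0));

        src.allocator()->allocate();
        weights.allocator()->allocate();
        biases.allocator()->allocate();
        dst.allocator()->allocate();
        // ... fill src, weights and biases, then:
        lc_layer.run();
        return 0;
    }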
diff --git a/arm_compute/runtime/NEON/functions/NEMinMaxLocation.h b/arm_compute/runtime/NEON/functions/NEMinMaxLocation.h
index e60349a..82e75ee 100644
--- a/arm_compute/runtime/NEON/functions/NEMinMaxLocation.h
+++ b/arm_compute/runtime/NEON/functions/NEMinMaxLocation.h
@@ -48,7 +48,7 @@
     NEMinMaxLocation();
     /** Initialise the kernel's inputs and outputs.
      *
-     * @param[in]  input     Input image. Data types supported: U8 or S16.
+     * @param[in]  input     Input image. Data types supported: U8/S16.
      * @param[out] min       Minimum value of image.
      * @param[out] max       Maximum value of image.
      * @param[out] min_loc   (Optional) Array of minimum value locations.
diff --git a/arm_compute/runtime/NEON/functions/NENonMaximaSuppression3x3.h b/arm_compute/runtime/NEON/functions/NENonMaximaSuppression3x3.h
index 06e4f08..c87d722 100644
--- a/arm_compute/runtime/NEON/functions/NENonMaximaSuppression3x3.h
+++ b/arm_compute/runtime/NEON/functions/NENonMaximaSuppression3x3.h
@@ -45,7 +45,7 @@
      * @note The implementation supports just 2 border modes: UNDEFINED and CONSTANT
      *       The constant values used with CONSTANT border mode is 0
      *
-     * @param[in, out] input       Source tensor. Data type supported: U8, F32. (Written to only for @p border_mode != UNDEFINED)
+     * @param[in, out] input       Source tensor. Data type supported: U8/F32. (Written to only for @p border_mode != UNDEFINED)
      * @param[out]     output      Destination for the Non-Maxima suppressions 3x3. Data type supported: same as @p input
      * @param[in]      border_mode Border mode to use for non-maxima suppression. The implementation supports just 2 border modes: UNDEFINED and CONSTANT
      *
diff --git a/arm_compute/runtime/NEON/functions/NENormalizationLayer.h b/arm_compute/runtime/NEON/functions/NENormalizationLayer.h
index b7be34d..3202867 100644
--- a/arm_compute/runtime/NEON/functions/NENormalizationLayer.h
+++ b/arm_compute/runtime/NEON/functions/NENormalizationLayer.h
@@ -52,7 +52,7 @@
     /** Set the input and output tensors.
      *
      * @param[in]  input     Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM],
-     *                       and an optional 4th dimension for batch of inputs. Data type supported: F32. Number of channels must be 1.
+     *                       and an optional 4th dimension for batch of inputs. Data type supported: QS8/F32
      * @param[out] output    Destination with the same dimensions, data type and number of channels of  @p input
      * @param[in]  norm_info Normalization layer information like the normalization type, normalization size and other parameters.
      */
diff --git a/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h b/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h
index 835bd13..de7a797 100644
--- a/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h
+++ b/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h
@@ -37,9 +37,9 @@
 public:
     /** Initialise the kernel's inputs, output and conversion policy.
      *
-     * @param[in]  input1          First tensor input. Data types supported: U8 or S16.
-     * @param[in]  input2          Second tensor input. Data types supported: U8 or S16.
-     * @param[out] output          Output tensor. Data types supported: U8 or S16.
+     * @param[in]  input1          First tensor input. Data types supported: U8/QS8/S16/F32.
+     * @param[in]  input2          Second tensor input. Data types supported: U8/QS8/S16/F32.
+     * @param[out] output          Output tensor. Data types supported: U8/QS8/S16/F32.
      * @param[in]  scale           Scale to apply after multiplication. Must be positive.
      * @param[in]  overflow_policy Overflow policy.
      * @param[in]  rounding_policy Rounding policy.
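To make the scale/overflow/rounding parameters above concrete, a hedged sketch of a U8 element-wise multiplication. The image size, scale value and policy choices are illustrative assumptions:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    int main()
    {
        // Sketch only: out = saturate(in1 * in2 * scale) with truncation towards zero.
        Tensor in1, in2, out;
        const TensorInfo info(TensorShape(640U, 480U), 1, DataType::U8);
        in1.allocator()->init(info);
        in2.allocator()->init(info);
        out.allocator()->init(info);

        NEPixelWiseMultiplication mul;
        mul.configure(&in1, &in2, &out, 1.0f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);

        in1.allocator()->allocate();
        in2.allocator()->allocate();
        out.allocator()->allocate();
        // ... fill in1 and in2, then:
        mul.run();
        return 0;
    }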
diff --git a/arm_compute/runtime/NEON/functions/NEPoolingLayer.h b/arm_compute/runtime/NEON/functions/NEPoolingLayer.h
index 5d67830..5a9cffa 100644
--- a/arm_compute/runtime/NEON/functions/NEPoolingLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEPoolingLayer.h
@@ -42,7 +42,7 @@
 public:
     /** Set the input and output tensors.
      *
-     * @param[in, out] input     Source tensor. (Written to only when padding != 0) Data types supported: F32.
+     * @param[in, out] input     Source tensor. (Written to only when padding != 0) Data types supported: QS8/F32.
      * @param[out]     output    Destination tensor. Data types supported: Same as @p input.
      * @param[in]      pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
      */
diff --git a/arm_compute/runtime/NEON/functions/NESoftmaxLayer.h b/arm_compute/runtime/NEON/functions/NESoftmaxLayer.h
index c67b667..dc84dec 100644
--- a/arm_compute/runtime/NEON/functions/NESoftmaxLayer.h
+++ b/arm_compute/runtime/NEON/functions/NESoftmaxLayer.h
@@ -50,7 +50,7 @@
     NESoftmaxLayer();
     /** Set the input and output tensors.
      *
-     * @param[in]  input  Source tensor. Data types supported: F32.
+     * @param[in]  input  Source tensor. Data types supported: QS8/F32.
      * @param[out] output Destination tensor. Data types supported: same as @p input.
      */
     void configure(ITensor *input, ITensor *output);
@@ -63,7 +63,6 @@
     NELogits1DShiftExpSumKernel _shift_exp_sum_kernel;
     NELogits1DNormKernel        _norm_kernel;
     NEFillBorderKernel          _fill_border_kernel;
-    NEFillBorderKernel          _fill_border_kernel_sum;
     Tensor                      _max;
     Tensor                      _sum;
     Tensor                      _tmp;
diff --git a/arm_compute/runtime/NEON/functions/NETableLookup.h b/arm_compute/runtime/NEON/functions/NETableLookup.h
index d2f9d30..b59ffb8 100644
--- a/arm_compute/runtime/NEON/functions/NETableLookup.h
+++ b/arm_compute/runtime/NEON/functions/NETableLookup.h
@@ -37,9 +37,9 @@
 public:
     /** Initialise the kernel's inputs and output
      *
-     * @param[in]  input  First tensor input. Data types supported: U8 and S16
+     * @param[in]  input  First tensor input. Data types supported: U8/S16
      * @param[in]  lut    Input lookup table.
-     * @param[out] output Output tensor. Data types supported: U8 and S16.
+     * @param[out] output Output tensor. Data types supported: same as @p input
      */
     void configure(const ITensor *input, const ILut *lut, ITensor *output);
 };
diff --git a/arm_compute/runtime/NEON/functions/NETranspose.h b/arm_compute/runtime/NEON/functions/NETranspose.h
index 1b88715..4b606e7 100644
--- a/arm_compute/runtime/NEON/functions/NETranspose.h
+++ b/arm_compute/runtime/NEON/functions/NETranspose.h
@@ -41,7 +41,7 @@
 public:
     /** Initialise the kernel's inputs and output
      *
-     * @param[in]  input  Input tensor. Data types supported: U8/S8/U16/S16/F16/U32/S32/F32
+     * @param[in]  input  Input tensor. Data types supported: U8/S8/QS8/U16/S16/F16/U32/S32/F32
      * @param[out] output Output tensor. Data type supported: Same as @p input
      */
     void configure(const ITensor *input, ITensor *output);
diff --git a/arm_compute/runtime/OMP/OMPScheduler.h b/arm_compute/runtime/OMP/OMPScheduler.h
new file mode 100644
index 0000000..21df6a6
--- /dev/null
+++ b/arm_compute/runtime/OMP/OMPScheduler.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_OMPSCHEDULER_H__
+#define __ARM_COMPUTE_OMPSCHEDULER_H__
+
+#include "arm_compute/runtime/IScheduler.h"
+
+namespace arm_compute
+{
+/** Pool of threads to automatically split a kernel's execution among several threads. */
+class OMPScheduler : public IScheduler
+{
+public:
+    /** Sets the number of threads the scheduler will use to run the kernels.
+     *
+     * @param[in] num_threads If set to 0, then the number returned by omp_get_max_threads() will be used, otherwise the number of threads specified.
+     */
+    void set_num_threads(unsigned int num_threads) override;
+    /** Returns the number of threads that the OMPScheduler has in its pool.
+     *
+     * @return Number of threads available in OMPScheduler.
+     */
+    unsigned int num_threads() const override;
+    /** Access the scheduler singleton
+     *
+     * @return The scheduler
+     */
+    static OMPScheduler &get();
+    /** Multithread the execution of the passed kernel if possible.
+     *
+     * The kernel will run on a single thread if any of these conditions is true:
+     * - ICPPKernel::is_parallelisable() returns false
+     * - The scheduler has been initialized with only one thread.
+     *
+     * @param[in] kernel          Kernel to execute.
+     * @param[in] split_dimension Dimension along which to split the kernel's execution window.
+     */
+    void schedule(ICPPKernel *kernel, unsigned int split_dimension) override;
+
+private:
+    /** Constructor. */
+    OMPScheduler();
+
+    unsigned int _num_threads;
+};
+}
+#endif /* __ARM_COMPUTE_OMPSCHEDULER_H__ */
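A hedged sketch of how this OpenMP scheduler is typically selected and sized through the Scheduler facade introduced later in this patch; the thread count is an arbitrary example:

    #include "arm_compute/runtime/Scheduler.h"

    using namespace arm_compute;

    int main()
    {
        // Sketch only: route kernel execution through the OpenMP scheduler, if it was built in.
        if(Scheduler::is_available(Scheduler::Type::OMP))
        {
            Scheduler::set(Scheduler::Type::OMP);
            Scheduler::get().set_num_threads(4); // 0 would fall back to omp_get_max_threads()
        }
        return 0;
    }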
diff --git a/arm_compute/runtime/Scheduler.h b/arm_compute/runtime/Scheduler.h
new file mode 100644
index 0000000..21f944b
--- /dev/null
+++ b/arm_compute/runtime/Scheduler.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_SCHEDULER_H__
+#define __ARM_COMPUTE_SCHEDULER_H__
+
+#include "arm_compute/runtime/IScheduler.h"
+#include <memory>
+
+namespace arm_compute
+{
+/** Configurable scheduler that supports multiple multithreading APIs and allows choosing between different schedulers at runtime. */
+class Scheduler
+{
+public:
+    enum class Type
+    {
+        ST,    // Single thread.
+        CPP,   // C++11 threads.
+        OMP,   // OpenMP.
+        CUSTOM // Provided by the user.
+    };
+    /** Sets the user defined scheduler and makes it the active scheduler.
+     *
+     * @param[in] scheduler A shared pointer to a custom scheduler implemented by the user.
+     */
+    static void set(std::shared_ptr<IScheduler> &scheduler);
+    /** Access the scheduler singleton.
+     *
+     * @return A reference to the scheduler object.
+     */
+    static IScheduler &get();
+    /** Set the active scheduler.
+     *
+     * Only one scheduler can be enabled at any time.
+     *
+     * @param[in] t the type of the scheduler to be enabled.
+     */
+    static void set(Type t);
+    /** Returns the type of the active scheduler.
+     *
+     * @return The current scheduler's type.
+     */
+    static Type get_type();
+    /** Returns true if the given scheduler type is supported. False otherwise.
+     *
+     * @return true if the given scheduler type is supported. False otherwise.
+     */
+    static bool is_available(Type t);
+
+private:
+    static Type                        _scheduler_type;
+    static std::shared_ptr<IScheduler> _custom_scheduler;
+    Scheduler();
+};
+}
+#endif /* __ARM_COMPUTE_SCHEDULER_H__ */
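The CUSTOM type above is selected implicitly by installing a user-defined scheduler. A hedged sketch using a hypothetical MyScheduler; it assumes the three virtual methods of IScheduler overridden elsewhere in this patch and ICPPKernel::run(const Window &):

    #include "arm_compute/core/CPP/ICPPKernel.h"
    #include "arm_compute/runtime/IScheduler.h"
    #include "arm_compute/runtime/Scheduler.h"

    #include <memory>

    using namespace arm_compute;

    // Sketch only: a hypothetical user scheduler that runs every kernel in the caller's thread.
    class MyScheduler : public IScheduler
    {
    public:
        void set_num_threads(unsigned int num_threads) override
        {
            (void)num_threads; // single-threaded: nothing to configure
        }
        unsigned int num_threads() const override
        {
            return 1;
        }
        void schedule(ICPPKernel *kernel, unsigned int split_dimension) override
        {
            (void)split_dimension;
            kernel->run(kernel->window()); // execute over the kernel's full window
        }
    };

    int main()
    {
        std::shared_ptr<IScheduler> my_scheduler = std::make_shared<MyScheduler>();
        Scheduler::set(my_scheduler); // Scheduler::get_type() now reports Type::CUSTOM
        return 0;
    }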
diff --git a/arm_compute/runtime/SingleThreadScheduler.h b/arm_compute/runtime/SingleThreadScheduler.h
new file mode 100644
index 0000000..a6e1def
--- /dev/null
+++ b/arm_compute/runtime/SingleThreadScheduler.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_SINGLETHREADSCHEDULER_H__
+#define __ARM_COMPUTE_SINGLETHREADSCHEDULER_H__
+
+#include "arm_compute/runtime/IScheduler.h"
+
+namespace arm_compute
+{
+/** Scheduler that runs each kernel in the caller's thread, without any multithreading. */
+class SingleThreadScheduler : public IScheduler
+{
+public:
+    /** Sets the number of threads the scheduler will use to run the kernels.
+     *
+     * @param[in] num_threads This is ignored for this scheduler as the number of threads is always one.
+     */
+    void set_num_threads(unsigned int num_threads) override;
+    /** Returns the number of threads that the SingleThreadScheduler has, which is always 1.
+     *
+     * @return Number of threads available in SingleThreadScheduler.
+     */
+    unsigned int num_threads() const override;
+    /** Access the scheduler singleton
+     *
+     * @return The scheduler
+     */
+    static SingleThreadScheduler &get();
+    /** Runs the kernel in the same thread as the caller synchronously.
+     *
+     * @param[in] kernel          Kernel to execute.
+     * @param[in] split_dimension Dimension along which to split the kernel's execution window.
+     */
+    void schedule(ICPPKernel *kernel, unsigned int split_dimension) override;
+
+private:
+    /** Constructor. */
+    SingleThreadScheduler() = default;
+};
+}
+#endif /* __ARM_COMPUTE_SINGLETHREADSCHEDULER_H__ */
diff --git a/arm_compute/runtime/SubTensor.h b/arm_compute/runtime/SubTensor.h
new file mode 100644
index 0000000..bdb229d
--- /dev/null
+++ b/arm_compute/runtime/SubTensor.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_SUBTENSOR_H__
+#define __ARM_COMPUTE_SUBTENSOR_H__
+
+#include "arm_compute/core/SubTensorInfo.h"
+#include "arm_compute/runtime/Tensor.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensorInfo;
+
+/** Basic implementation of the sub-tensor interface */
+class SubTensor : public ITensor
+{
+public:
+    /** Constructor
+     *
+     * @param[in] parent       Parent tensor
+     * @param[in] tensor_shape Shape of the subtensor
+     * @param[in] coords       Coordinates of the first subtensor element inside the parent tensor.
+     */
+    SubTensor(ITensor *parent, const TensorShape &tensor_shape, const Coordinates &coords);
+    /** Destructor: free the tensor's memory */
+    ~SubTensor() = default;
+    /** Prevent instances of this class from being copy constructed */
+    SubTensor(const SubTensor &) = delete;
+    /** Prevent instances of this class from being copied */
+    SubTensor &operator=(const SubTensor &) = delete;
+    /** Allow instances of this class to be move constructed */
+    SubTensor(SubTensor &&) = default;
+    /** Allow instances of this class to be moved */
+    SubTensor &operator=(SubTensor &&) = default;
+    /** Return the parent tensor of the subtensor
+     *
+     * @return Parent tensor
+     */
+    ITensor *parent();
+
+    // Inherited methods overridden:
+    ITensorInfo *info() const override;
+    ITensorInfo *info() override;
+    uint8_t     *buffer() const override;
+
+private:
+    ITensor              *_parent;
+    mutable SubTensorInfo _info;
+};
+}
+#endif /*__ARM_COMPUTE_SUBTENSOR_H__ */
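A hedged sketch of carving a view out of a parent tensor with the constructor documented above; the shapes and coordinates are illustrative:

    #include "arm_compute/core/Coordinates.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/SubTensor.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    int main()
    {
        // Sketch only: view the last two feature maps of an 8x8x4 F32 tensor.
        Tensor parent;
        parent.allocator()->init(TensorInfo(TensorShape(8U, 8U, 4U), 1, DataType::F32));
        parent.allocator()->allocate();

        SubTensor sub(&parent, TensorShape(8U, 8U, 2U), Coordinates(0, 0, 2));
        // sub.buffer() points into parent's allocation; no extra memory is allocated.
        return 0;
    }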
diff --git a/arm_compute/runtime/Tensor.h b/arm_compute/runtime/Tensor.h
index e491635..1fe73a2 100644
--- a/arm_compute/runtime/Tensor.h
+++ b/arm_compute/runtime/Tensor.h
@@ -31,7 +31,7 @@
 
 namespace arm_compute
 {
-class TensorInfo;
+class ITensorInfo;
 
 /** Basic implementation of the tensor interface */
 class Tensor : public ITensor
@@ -52,9 +52,9 @@
     TensorAllocator *allocator();
 
     // Inherited methods overridden:
-    TensorInfo *info() const override;
-    TensorInfo *info() override;
-    uint8_t    *buffer() const override;
+    ITensorInfo *info() const override;
+    ITensorInfo *info() override;
+    uint8_t     *buffer() const override;
 
 private:
     mutable TensorAllocator _allocator; /**< Instance of the basic CPU allocator.*/
diff --git a/arm_compute/runtime/Utils.h b/arm_compute/runtime/Utils.h
new file mode 100644
index 0000000..2f037a0
--- /dev/null
+++ b/arm_compute/runtime/Utils.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_RUNTIME_UTILS_H__
+#define __ARM_COMPUTE_RUNTIME_UTILS_H__
+
+#include "arm_compute/runtime/Scheduler.h"
+
+#include <string>
+
+namespace arm_compute
+{
+/** Convert a Scheduler::Type into a string.
+ *
+ * @param[in] t @ref Scheduler::Type to be translated to string.
+ *
+ * @return The string describing the scheduler type.
+ */
+const std::string &string_from_scheduler_type(Scheduler::Type t);
+}
+#endif /* __ARM_COMPUTE_RUNTIME_UTILS_H__ */