arm_compute v17.10

Change-Id: If1489af40eccd0219ede8946577afbf04db31b29
diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp
index e165cf3..6e5e802 100644
--- a/src/core/CL/CLKernelLibrary.cpp
+++ b/src/core/CL/CLKernelLibrary.cpp
@@ -569,7 +569,7 @@
 };
 
 CLKernelLibrary::CLKernelLibrary()
-    : _context(), _device(), _kernel_path("."), _programs_map(), _built_programs_map(), _max_workgroup_size(0)
+    : _context(), _device(), _kernel_path("."), _programs_map(), _built_programs_map()
 {
 }
 
@@ -709,19 +709,18 @@
     return program_source_it->second;
 }
 
-size_t CLKernelLibrary::max_local_workgroup_size()
+size_t CLKernelLibrary::max_local_workgroup_size(const cl::Kernel &kernel) const
 {
-    if(_max_workgroup_size == 0)
-    {
-        size_t err = clGetDeviceInfo(_device.get(), CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &_max_workgroup_size, nullptr);
-        ARM_COMPUTE_ERROR_ON_MSG(err != 0, "clGetDeviceInfo failed to return valid information");
-        ARM_COMPUTE_UNUSED(err);
-    }
+    size_t result;
 
-    return _max_workgroup_size;
+    cl_int err = kernel.getWorkGroupInfo(_device, CL_KERNEL_WORK_GROUP_SIZE, &result);
+    ARM_COMPUTE_ERROR_ON_MSG(err != 0, "clGetKernelWorkGroupInfo failed to return the maximum workgroup size for the kernel");
+    ARM_COMPUTE_UNUSED(err);
+
+    return result;
 }
 
-cl::NDRange CLKernelLibrary::default_ndrange()
+cl::NDRange CLKernelLibrary::default_ndrange() const
 {
-    return cl::NDRange(std::min<size_t>(_max_workgroup_size, 128u), 1);
+    return cl::NDRange(128u, 1);
 }
diff --git a/src/core/CL/ICLKernel.cpp b/src/core/CL/ICLKernel.cpp
index 1e04f00..17b58b7 100644
--- a/src/core/CL/ICLKernel.cpp
+++ b/src/core/CL/ICLKernel.cpp
@@ -52,18 +52,28 @@
                     (window.y().end() - window.y().start()) / window.y().step(),
                     (window.z().end() - window.z().start()) / window.z().step());
 
+    cl::NDRange valid_lws;
+    if(lws_hint[0] * lws_hint[1] * lws_hint[2] > kernel.get_max_workgroup_size())
+    {
+        valid_lws = cl::NullRange;
+    }
+    else
+    {
+        valid_lws = lws_hint;
+    }
+
     cl::NDRange lws = cl::NullRange;
 
-    if((lws_hint[0] <= gws[0]) && (lws_hint[1] <= gws[1]) && (lws_hint[2] <= gws[2]))
+    if((valid_lws[0] <= gws[0]) && (valid_lws[1] <= gws[1]) && (valid_lws[2] <= gws[2]))
     {
-        lws = lws_hint;
+        lws = valid_lws;
     }
 
     queue.enqueueNDRangeKernel(kernel.kernel(), cl::NullRange, gws, lws);
 }
 
 ICLKernel::ICLKernel()
-    : _kernel(nullptr), _lws_hint(CLKernelLibrary::get().default_ndrange()), _target(GPUTarget::MIDGARD), _config_id(arm_compute::default_config_id)
+    : _kernel(nullptr), _lws_hint(CLKernelLibrary::get().default_ndrange()), _target(GPUTarget::MIDGARD), _config_id(arm_compute::default_config_id), _max_workgroup_size(0)
 {
 }
 
@@ -163,3 +173,12 @@
 {
     return _target;
 }
+
+size_t ICLKernel::get_max_workgroup_size()
+{
+    if(_max_workgroup_size == 0)
+    {
+        _max_workgroup_size = CLKernelLibrary::get().max_local_workgroup_size(_kernel);
+    }
+    return _max_workgroup_size;
+}
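
The dispatch rule introduced above: honour the local-work-size hint only if its total size fits the kernel's limit and each dimension fits the global size; otherwise pass cl::NullRange so the driver picks. A standalone sketch of the clamping step, assuming the OpenCL 1.x C++ bindings (cl.hpp) the library builds against:

    #include <CL/cl.hpp>

    // Fall back to a driver-chosen local size when the hint exceeds the
    // kernel's reported CL_KERNEL_WORK_GROUP_SIZE.
    cl::NDRange sanitize_lws(const cl::NDRange &hint, size_t kernel_max_wg)
    {
        if(hint[0] * hint[1] * hint[2] > kernel_max_wg)
        {
            return cl::NullRange;
        }
        return hint;
    }
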
diff --git a/src/core/CL/OpenCL.cpp b/src/core/CL/OpenCL.cpp
index 1d04f39..287c5e2 100644
--- a/src/core/CL/OpenCL.cpp
+++ b/src/core/CL/OpenCL.cpp
@@ -100,6 +100,8 @@
     clGetDeviceInfo           = reinterpret_cast<clGetDeviceInfo_func>(dlsym(handle, "clGetDeviceInfo"));
     clGetDeviceIDs            = reinterpret_cast<clGetDeviceIDs_func>(dlsym(handle, "clGetDeviceIDs"));
     clRetainEvent             = reinterpret_cast<clRetainEvent_func>(dlsym(handle, "clRetainEvent"));
+    clGetPlatformIDs          = reinterpret_cast<clGetPlatformIDs_func>(dlsym(handle, "clGetPlatformIDs"));
+    clGetKernelWorkGroupInfo  = reinterpret_cast<clGetKernelWorkGroupInfo_func>(dlsym(handle, "clGetKernelWorkGroupInfo"));
 
     dlclose(handle);
 
@@ -632,3 +634,37 @@
         return CL_OUT_OF_RESOURCES;
     }
 }
+
+cl_int clGetPlatformIDs(cl_uint num_entries, cl_platform_id *platforms, cl_uint *num_platforms)
+{
+    arm_compute::CLSymbols::get().load_default();
+    auto func = arm_compute::CLSymbols::get().clGetPlatformIDs;
+    if(func != nullptr)
+    {
+        return func(num_entries, platforms, num_platforms);
+    }
+    else
+    {
+        return CL_OUT_OF_RESOURCES;
+    }
+}
+
+cl_int
+clGetKernelWorkGroupInfo(cl_kernel                 kernel,
+                         cl_device_id              device,
+                         cl_kernel_work_group_info param_name,
+                         size_t                    param_value_size,
+                         void                     *param_value,
+                         size_t                   *param_value_size_ret)
+{
+    arm_compute::CLSymbols::get().load_default();
+    auto func = arm_compute::CLSymbols::get().clGetKernelWorkGroupInfo;
+    if(func != nullptr)
+    {
+        return func(kernel, device, param_name, param_value_size, param_value, param_value_size_ret);
+    }
+    else
+    {
+        return CL_OUT_OF_RESOURCES;
+    }
+}
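
Both new entry points follow the existing CLSymbols pattern: the public C symbol lazily loads the real driver function via dlsym and forwards to it, returning CL_OUT_OF_RESOURCES when no driver is present, so binaries link and start even without an OpenCL library installed. A reduced sketch of the pattern for one symbol (the loader name and the soname "libOpenCL.so" are assumptions for illustration):

    #include <CL/cl.h>
    #include <dlfcn.h>

    using clGetPlatformIDs_func = cl_int (*)(cl_uint, cl_platform_id *, cl_uint *);

    // Resolve a single OpenCL entry point at runtime. As in
    // CLSymbols::load_default above, the handle is closed right after
    // resolving; the code relies on the library staying mapped.
    clGetPlatformIDs_func load_clGetPlatformIDs()
    {
        void *handle = dlopen("libOpenCL.so", RTLD_LAZY | RTLD_LOCAL);
        if(handle == nullptr)
        {
            return nullptr;
        }
        auto func = reinterpret_cast<clGetPlatformIDs_func>(dlsym(handle, "clGetPlatformIDs"));
        dlclose(handle);
        return func;
    }
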
diff --git a/src/core/CL/cl_kernels/batchnormalization_layer.cl b/src/core/CL/cl_kernels/batchnormalization_layer.cl
index b7423d8..f7aa5eb 100644
--- a/src/core/CL/cl_kernels/batchnormalization_layer.cl
+++ b/src/core/CL/cl_kernels/batchnormalization_layer.cl
@@ -44,7 +44,7 @@
 
 /** Apply batch normalization.
  *
- * @param[in]  input_ptr                            Pointer to the first source tensor. Supported data types: QS8/QS16/F32
+ * @param[in]  input_ptr                            Pointer to the first source tensor. Supported data types: QS8/QS16/F16/F32
  * @param[in]  input_stride_x                       Stride of the first source tensor in X dimension (in bytes)
  * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  input_stride_y                       Stride of the first source tensor in Y dimension (in bytes)
@@ -100,7 +100,7 @@
     Vector gamma = CONVERT_TO_VECTOR_STRUCT(gamma);
 
     VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-    _in = 0;
+    data = 0;
     VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
     denominator = 0;
     VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
@@ -114,13 +114,13 @@
 
     const int current_slice = get_global_id(2);
 
-    _in         = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr);
+    data        = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr);
     denominator = *((__global DATA_TYPE *)(var.ptr + current_slice * var.stride_x));
-    denominator = INVSQRT_OP(ADD_OP(denominator, SQCVT_SAT(epsilon)));
+    denominator = INVSQRT_OP(ADD_OP(denominator, ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(epsilon))));
 
     // Calculate x bar and store results
     numerator = *((__global DATA_TYPE *)(mean.ptr + current_slice * mean.stride_x));
-    numerator = SUB_OP(_in, numerator);
+    numerator = SUB_OP(data, numerator);
     x_bar     = MUL_OP(numerator, denominator);
 
     gamma_vec = *((__global DATA_TYPE *)(gamma.ptr + current_slice * beta.stride_x));
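
The renames above (_in to data) and the explicit vector cast around SQCVT_SAT(epsilon) do not change the math: the kernel still computes the standard batch-normalization transform. A scalar reference of the per-element computation, using the kernel's denominator/x_bar naming (a sketch, not library API):

    #include <cmath>

    // out = gamma * (x - mean) / sqrt(var + epsilon) + beta
    float batch_normalize(float x, float mean, float var, float beta, float gamma, float epsilon)
    {
        const float denominator = 1.0f / std::sqrt(var + epsilon);
        const float x_bar       = (x - mean) * denominator;
        return gamma * x_bar + beta;
    }
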
diff --git a/src/core/CL/cl_kernels/color_convert.cl b/src/core/CL/cl_kernels/color_convert.cl
index f5ec85a..01d8b90 100644
--- a/src/core/CL/cl_kernels/color_convert.cl
+++ b/src/core/CL/cl_kernels/color_convert.cl
@@ -54,7 +54,7 @@
     uchar16 rgb_2 = vload16(0, in.ptr + 32);
 
     uchar16 rgba_0 = (uchar16)(rgb_0.s012, 255, rgb_0.s345, 255, rgb_0.s678, 255, rgb_0.s9ab, 255);
-    uchar16 rgba_1 = (uchar16)(rgb_0.scde, 255, rgb_0.f, rgb_1.s01, 255, rgb_1.s234, 255, rgb_1.s567, 255);
+    uchar16 rgba_1 = (uchar16)(rgb_0.scde, 255, rgb_0.sf, rgb_1.s01, 255, rgb_1.s234, 255, rgb_1.s567, 255);
     uchar16 rgba_2 = (uchar16)(rgb_1.s89a, 255, rgb_1.sbcd, 255, rgb_1.sef, rgb_2.s0, 255, rgb_2.s123, 255);
     uchar16 rgba_3 = (uchar16)(rgb_2.s456, 255, rgb_2.s789, 255, rgb_2.sabc, 255, rgb_2.sdef, 255);
 
diff --git a/src/core/CL/cl_kernels/direct_convolution1x1.cl b/src/core/CL/cl_kernels/direct_convolution1x1.cl
index fb516dd..7b73b85 100644
--- a/src/core/CL/cl_kernels/direct_convolution1x1.cl
+++ b/src/core/CL/cl_kernels/direct_convolution1x1.cl
@@ -191,7 +191,7 @@
 
     weights.ptr += z_index * weights_stride_w;
 
-    for(int d = 0; d < WEIGHTS_DEPTH; ++d)
+    for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d)
     {
         DATA_TYPE weight = *(__global DATA_TYPE *)weights.ptr;
         VEC_DATA_TYPE(DATA_TYPE, 8)
diff --git a/src/core/CL/cl_kernels/direct_convolution3x3.cl b/src/core/CL/cl_kernels/direct_convolution3x3.cl
index d094eca..1420d7c 100644
--- a/src/core/CL/cl_kernels/direct_convolution3x3.cl
+++ b/src/core/CL/cl_kernels/direct_convolution3x3.cl
@@ -138,7 +138,7 @@
     const int kernel_index = get_global_id(2);
     weights_addr += kernel_index * weights_stride_w;
 
-    for(int d = 0; d < WEIGHTS_DEPTH; ++d)
+    for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d)
     {
         CONVOLUTION1x3(pixels0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_y));
         CONVOLUTION1x3(pixels0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y));
diff --git a/src/core/CL/cl_kernels/direct_convolution5x5.cl b/src/core/CL/cl_kernels/direct_convolution5x5.cl
index 496da97..6fdd019 100644
--- a/src/core/CL/cl_kernels/direct_convolution5x5.cl
+++ b/src/core/CL/cl_kernels/direct_convolution5x5.cl
@@ -127,7 +127,7 @@
     const int kernel_index = get_global_id(2);
     weights_addr += kernel_index * weights_stride_w;
 
-    for(int d = 0; d < WEIGHTS_DEPTH; ++d)
+    for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d)
     {
         CONVOLUTION1x5(pixels0, (__global DATA_TYPE *)src_addr, (__global DATA_TYPE *)weights_addr);
         CONVOLUTION1x5(pixels0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y));
diff --git a/src/core/CL/cl_kernels/helpers.h b/src/core/CL/cl_kernels/helpers.h
index 68af64e..4421e74 100644
--- a/src/core/CL/cl_kernels/helpers.h
+++ b/src/core/CL/cl_kernels/helpers.h
@@ -25,6 +25,9 @@
 #define ARM_COMPUTE_HELPER_H
 
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#if defined(ARM_COMPUTE_DEBUG_ENABLED)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED)
 
 #define EXPAND(x) x
 
diff --git a/src/core/CL/cl_kernels/warp_affine.cl b/src/core/CL/cl_kernels/warp_affine.cl
index 0a4748f..f41821c 100644
--- a/src/core/CL/cl_kernels/warp_affine.cl
+++ b/src/core/CL/cl_kernels/warp_affine.cl
@@ -84,7 +84,7 @@
 {
     Image in  = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
     Image out = CONVERT_TO_IMAGE_STRUCT(out);
-    vstore4(read_texels4(&in, convert_int8(clamp_to_border(apply_affine_transform(get_current_coords(), build_affine_mtx()), width, height))), 0, out.ptr);
+    vstore4(read_texels4(&in, convert_int8_rtn(clamp_to_border(apply_affine_transform(get_current_coords(), build_affine_mtx()), width, height))), 0, out.ptr);
 }
 
 /** Performs an affine transform on an image interpolating with the BILINEAR method. Input and output are single channel U8.
@@ -116,5 +116,5 @@
 {
     Image in  = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
     Image out = CONVERT_TO_IMAGE_STRUCT(out);
-    vstore4(bilinear_interpolate(&in, clamp_to_border(apply_affine_transform(get_current_coords(), build_affine_mtx()), width, height), width, height), 0, out.ptr);
+    vstore4(bilinear_interpolate(&in, apply_affine_transform(get_current_coords(), build_affine_mtx()), width, height), 0, out.ptr);
 }
diff --git a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
index 18c0c97..43f39f4 100644
--- a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
@@ -45,7 +45,7 @@
 void CLBatchNormalizationLayerKernel::configure(ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, const ICLTensor *gamma,
                                                 float epsilon)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
 
     _input   = input;
     _output  = output;
diff --git a/src/core/CL/kernels/CLFloorKernel.cpp b/src/core/CL/kernels/CLFloorKernel.cpp
index 6c9f83f..11f8e33 100644
--- a/src/core/CL/kernels/CLFloorKernel.cpp
+++ b/src/core/CL/kernels/CLFloorKernel.cpp
@@ -44,10 +44,8 @@
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
-    set_shape_if_empty(*output->info(), input->info()->tensor_shape());
-
-    set_data_type_if_unknown(*input->info(), DataType::F32);
-    set_data_type_if_unknown(*output->info(), DataType::F32);
+    // Auto initialize output
+    auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
 
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
diff --git a/src/core/CL/kernels/CLWarpAffineKernel.cpp b/src/core/CL/kernels/CLWarpAffineKernel.cpp
index e549dbc..be095f2 100644
--- a/src/core/CL/kernels/CLWarpAffineKernel.cpp
+++ b/src/core/CL/kernels/CLWarpAffineKernel.cpp
@@ -88,8 +88,11 @@
 
     Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
 
-    AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowStatic     output_access(output->info(), 0, 0, output->info()->dimension(0), output->info()->dimension(1));
+    int       total_right  = ceil_to_multiple(input->info()->dimension(0), num_elems_processed_per_iteration);
+    const int access_right = total_right + (((total_right - input->info()->dimension(0)) == 0) ? border_size().right : 0);
+
+    AccessWindowStatic     input_access(input->info(), -border_size().left, -border_size().top, access_right, input->info()->dimension(1) + border_size().bottom);
+    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
 
     update_window_and_padding(win, input_access, output_access);
 
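
The access_right computation above widens the read window by the right border only when rounding the width up to the processing step left no slack. A small compile-time check of that arithmetic (ceil_to_multiple reimplemented here for illustration; arm_compute ships its own):

    // Round value up to the next multiple of divisor.
    constexpr int ceil_to_multiple(int value, int divisor)
    {
        return ((value + divisor - 1) / divisor) * divisor;
    }

    // Assuming num_elems_processed_per_iteration = 4:
    static_assert(ceil_to_multiple(33, 4) == 36, "rounding slack already covers the right border");
    static_assert(ceil_to_multiple(32, 4) == 32, "no slack, so border_size().right is added explicitly");
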
diff --git a/src/core/Logger.cpp b/src/core/Logger.cpp
new file mode 100644
index 0000000..9c3bf26
--- /dev/null
+++ b/src/core/Logger.cpp
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/Logger.h"
+
+using namespace arm_compute;
+
+Logger::Logger()
+    : _ostream(&std::cout), _nullstream(nullptr), _verbosity(LoggerVerbosity::NONE)
+{
+}
+
+Logger &Logger::get()
+{
+    static Logger _instance;
+    return _instance;
+}
+
+void Logger::set_logger(std::ostream &ostream, LoggerVerbosity verbosity)
+{
+    _ostream   = &ostream;
+    _verbosity = verbosity;
+}
+
+std::ostream &Logger::log_info()
+{
+    if(_verbosity == LoggerVerbosity::INFO)
+    {
+        return *_ostream;
+    }
+    else
+    {
+        return _nullstream;
+    }
+}
\ No newline at end of file
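
Implementation note on Logger: constructing the std::ostream member _nullstream with a null streambuf leaves the stream in a failed state (badbit), so insertions are silently discarded; that is what makes log_info() a no-op below INFO verbosity. A self-contained sketch of the trick:

    #include <iostream>

    int main()
    {
        std::ostream null_stream(nullptr);        // no streambuf attached: badbit is set
        null_stream << "this text is discarded";  // insertion becomes a no-op
        std::cout << std::boolalpha << null_stream.bad() << '\n'; // prints: true
        return 0;
    }
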
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
index c8e1113..2766d69 100644
--- a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
@@ -203,7 +203,7 @@
     vst1q_qs16(p, v);
 }
 
-inline void internal_vst1q(int *p, const qint32x4x2_t &v)
+inline void internal_vst1q(int32_t *p, const qint32x4x2_t &v)
 {
     vst1q_s32(p, v.val[0]);
     vst1q_s32(p + 4, v.val[1]);
diff --git a/src/core/NEON/kernels/NEFillBorderKernel.cpp b/src/core/NEON/kernels/NEFillBorderKernel.cpp
index 9505a25..593a529 100644
--- a/src/core/NEON/kernels/NEFillBorderKernel.cpp
+++ b/src/core/NEON/kernels/NEFillBorderKernel.cpp
@@ -99,7 +99,7 @@
 } // namespace arm_compute
 
 NEFillBorderKernel::NEFillBorderKernel()
-    : _tensor(nullptr), _border_size(0), _mode(BorderMode::UNDEFINED), _constant_border_value(0)
+    : _tensor(nullptr), _border_size(0), _mode(BorderMode::UNDEFINED), _constant_border_value(static_cast<float>(0.f))
 {
 }
 
diff --git a/src/core/NEON/kernels/NEFillInnerBorderKernel.cpp b/src/core/NEON/kernels/NEFillInnerBorderKernel.cpp
index 017e259..d1cff6f 100644
--- a/src/core/NEON/kernels/NEFillInnerBorderKernel.cpp
+++ b/src/core/NEON/kernels/NEFillInnerBorderKernel.cpp
@@ -42,7 +42,7 @@
 } // namespace arm_compute
 
 NEFillInnerBorderKernel::NEFillInnerBorderKernel()
-    : _tensor(nullptr), _border_size(0), _constant_border_value(0)
+    : _tensor(nullptr), _border_size(0), _constant_border_value(static_cast<float>(0.f))
 {
 }
 
diff --git a/src/core/NEON/kernels/NEFloorKernel.cpp b/src/core/NEON/kernels/NEFloorKernel.cpp
index dd85ac1..72b652d 100644
--- a/src/core/NEON/kernels/NEFloorKernel.cpp
+++ b/src/core/NEON/kernels/NEFloorKernel.cpp
@@ -39,10 +39,8 @@
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
-    set_shape_if_empty(*output->info(), input->info()->tensor_shape());
-
-    set_data_type_if_unknown(*input->info(), DataType::F32);
-    set_data_type_if_unknown(*output->info(), DataType::F32);
+    // Auto initialize output
+    auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
 
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
diff --git a/src/core/NEON/kernels/NELKTrackerKernel.cpp b/src/core/NEON/kernels/NELKTrackerKernel.cpp
index 6fac797..004ecd0 100644
--- a/src/core/NEON/kernels/NELKTrackerKernel.cpp
+++ b/src/core/NEON/kernels/NELKTrackerKernel.cpp
@@ -130,7 +130,7 @@
     }
 }
 
-std::tuple<int, int, int> NELKTrackerKernel::compute_spatial_gradient_matrix(const NELKInternalKeypoint &keypoint, int *bilinear_ix, int *bilinear_iy)
+std::tuple<int, int, int> NELKTrackerKernel::compute_spatial_gradient_matrix(const NELKInternalKeypoint &keypoint, int32_t *bilinear_ix, int32_t *bilinear_iy)
 {
     int iA11 = 0;
     int iA12 = 0;
@@ -218,7 +218,8 @@
     return std::make_tuple(iA11, iA12, iA22);
 }
 
-std::pair<int, int> NELKTrackerKernel::compute_image_mismatch_vector(const NELKInternalKeypoint &old_keypoint, const NELKInternalKeypoint &new_keypoint, const int *bilinear_ix, const int *bilinear_iy)
+std::pair<int, int> NELKTrackerKernel::compute_image_mismatch_vector(const NELKInternalKeypoint &old_keypoint, const NELKInternalKeypoint &new_keypoint, const int32_t *bilinear_ix,
+                                                                     const int32_t *bilinear_iy)
 {
     int ib1 = 0;
     int ib2 = 0;
@@ -402,8 +403,8 @@
     init_keypoints(list_start, list_end);
 
     const int buffer_size = _window_dimension * _window_dimension;
-    int       bilinear_ix[buffer_size];
-    int       bilinear_iy[buffer_size];
+    int32_t   bilinear_ix[buffer_size];
+    int32_t   bilinear_iy[buffer_size];
 
     const int half_window = _window_dimension / 2;
 
diff --git a/src/core/NEON/kernels/NENormalizationLayerKernel.cpp b/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
index fc3f5f2..d6d26e2 100644
--- a/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
@@ -61,7 +61,7 @@
         ARM_COMPUTE_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(norm_info.scale_coeff(), input);
     }
 
-    const unsigned int border_width = (norm_info.type() == NormType::CROSS_MAP) ? 0 : std::min(norm_info.norm_size() / 2, 3U);
+    const unsigned int border_width = (norm_info.type() == NormType::CROSS_MAP) ? 0 : std::min<unsigned int>(norm_info.norm_size() / 2, 3U);
 
     _input         = input;
     _input_squared = input_squared;
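
The explicit template argument sidesteps template argument deduction: std::min must deduce a single T from both arguments, which fails when norm_size() / 2 and 3U have different integral types on a given toolchain. A minimal illustration, assuming a size_t operand on one side:

    #include <algorithm>
    #include <cstddef>

    unsigned int clamped_half_size(std::size_t half_norm_size)
    {
        // std::min(half_norm_size, 3U);  // ill-formed: T deduced as both
        //                                // std::size_t and unsigned int
        return std::min<unsigned int>(half_norm_size, 3U); // OK: operand converts
    }
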
diff --git a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
index b97564e..8d4e465 100644
--- a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
@@ -108,14 +108,13 @@
     std::tie(pool_pad_x, pool_pad_y)       = pad_stride_info.pad();
     std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
 
-    static const std::set<int> supported_pool_sizes = { 2, 3, 7 };
+    static const std::set<int> supported_pool_sizes = { 2, 3 };
     ARM_COMPUTE_UNUSED(supported_pool_sizes);
 
     ARM_COMPUTE_ERROR_ON_NULLPTR(output);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON(pool_type == PoolingType::L2 && is_data_type_fixed_point(input->info()->data_type()));
-    ARM_COMPUTE_ERROR_ON(supported_pool_sizes.find(pool_size) == supported_pool_sizes.end());
-    ARM_COMPUTE_ERROR_ON(7 == pool_size && input->info()->data_type() != DataType::F32);
+    ARM_COMPUTE_ERROR_ON((supported_pool_sizes.find(pool_size) == supported_pool_sizes.end()) && (input->info()->data_type() != DataType::F32));
     ARM_COMPUTE_ERROR_ON(pool_pad_x >= pool_size || pool_pad_y >= pool_size);
     ARM_COMPUTE_ERROR_ON(is_data_type_fixed_point(input->info()->data_type()) && pool_stride_x > 2);
 
@@ -207,7 +206,7 @@
                     num_elems_read_per_iteration = 8; // We use vload8 for pooling7
                     break;
                 default:
-                    ARM_COMPUTE_ERROR("Pooling size not supported");
+                    num_elems_read_per_iteration = 1; // poolingN loads four elements at a time internally and handles the remainder with a scalar leftover loop
                     break;
             }
             num_elems_processed_per_iteration = 1;
@@ -380,7 +379,20 @@
             }
             break;
         default:
-            ARM_COMPUTE_ERROR("Unsupported pooling size");
+            switch(pool_type)
+            {
+                case PoolingType::AVG:
+                    _func = &NEPoolingLayerKernel::poolingN_f32<PoolingType::AVG>;
+                    break;
+                case PoolingType::L2:
+                    _func = &NEPoolingLayerKernel::poolingN_f32<PoolingType::L2>;
+                    break;
+                case PoolingType::MAX:
+                    _func = &NEPoolingLayerKernel::poolingN_f32<PoolingType::MAX>;
+                    break;
+                default:
+                    ARM_COMPUTE_ERROR("Unsupported pooling type!");
+            }
             break;
     }
 
@@ -1005,6 +1017,127 @@
     input, output);
 }
 
+template <PoolingType pooling_type>
+void NEPoolingLayerKernel::poolingN_f32(const Window &window_input, const Window &window)
+{
+    Iterator input(_input, window_input);
+    Iterator output(_output, window);
+
+    const int pool_size     = _pool_info.pool_size();
+    int       pool_pad_x    = 0;
+    int       pool_pad_y    = 0;
+    int       pool_stride_x = 0;
+    int       pool_stride_y = 0;
+    std::tie(pool_pad_x, pool_pad_y)       = _pool_info.pad_stride_info().pad();
+    std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
+    const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
+    const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
+
+    execute_window_loop(window, [&](const Coordinates & id)
+    {
+        float res = 0.0f;
+
+        if(pooling_type != PoolingType::MAX)
+        {
+            // Calculate scale
+            const float scale = calculate_avg_scale(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
+
+            // Perform pooling
+            float32x4_t vres = vdupq_n_f32(0.0f);
+
+            for(int y = 0; y < pool_size; ++y)
+            {
+                int x = 0;
+                for(; x <= (pool_size - 4); x += 4)
+                {
+                    const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_x) * _input->info()->strides_in_bytes().x() +
+                                                                                       (y - pool_pad_y) * _input->info()->strides_in_bytes().y()));
+
+                    // Square the input in case of L2 pooling, then accumulate
+                    if(pooling_type == PoolingType::L2)
+                    {
+                        vres = vmlaq_f32(vres, data, data);
+                    }
+                    else
+                    {
+                        vres = vaddq_f32(vres, data);
+                    }
+                }
+
+                // Leftover for loop
+                for(; x < pool_size; ++x)
+                {
+                    float data = *(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_x) * _input->info()->strides_in_bytes().x() + (y - pool_pad_y) * _input->info()->strides_in_bytes().y()));
+
+                    // Square the input in case of L2 pooling
+                    if(pooling_type == PoolingType::L2)
+                    {
+                        data *= data;
+                    }
+
+                    res += data;
+                }
+            }
+
+#if defined(__aarch64__)
+            // Reduction operation available on 64 bit architectures only
+            res += vaddvq_f32(vres);
+#else  // __aarch64__
+            // Reduction
+            float32x2_t tmp = vpadd_f32(vget_high_f32(vres), vget_low_f32(vres));
+            tmp             = vpadd_f32(tmp, tmp);
+
+            res += vget_lane_f32(tmp, 0);
+#endif // __aarch64__
+            // Divide by scale
+            res *= scale;
+        }
+        else
+        {
+            float32x4_t vres = vdupq_n_f32(std::numeric_limits<float>::lowest());
+            res              = std::numeric_limits<float>::lowest();
+
+            for(int y = 0; y < pool_size; ++y)
+            {
+                int x = 0;
+                for(; x <= (pool_size - 4); x += 4)
+                {
+                    const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_x) * _input->info()->strides_in_bytes().x() +
+                                                                                       (y - pool_pad_y) * _input->info()->strides_in_bytes().y()));
+                    vres                   = vmaxq_f32(vres, data);
+                }
+
+                // Leftover for loop
+                for(; x < pool_size; ++x)
+                {
+                    const float data = *(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_x) * _input->info()->strides_in_bytes().x() + (y - pool_pad_y) * _input->info()->strides_in_bytes().y()));
+                    res              = std::max(res, data);
+                }
+            }
+
+#if defined(__aarch64__)
+            // Reduction operation available on 64 bit architectures only
+            res = std::max(vmaxvq_f32(vres), res);
+#else  // __aarch64__
+            float32x2_t tmp = vpmax_f32(vget_high_f32(vres), vget_low_f32(vres));
+            tmp             = vpmax_f32(tmp, tmp);
+
+            res = std::max(res, vget_lane_f32(tmp, 0));
+#endif // __aarch64__
+        }
+
+        // Calculate square-root in case of l2 pooling
+        if(pooling_type == PoolingType::L2)
+        {
+            res = std::sqrt(res);
+        }
+
+        // Store result
+        *(reinterpret_cast<float *>(output.ptr())) = res;
+    },
+    input, output);
+}
+
 void NEPoolingLayerKernel::run(const Window &window, const ThreadInfo &info)
 {
     ARM_COMPUTE_UNUSED(info);
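
poolingN_f32 above uses the classic vector-plus-leftover structure: consume four floats per iteration with NEON, reduce the four lanes afterwards (vaddvq_f32 exists only on AArch64, hence the #ifdef fallback), and finish the remainder with scalar code. A minimal self-contained sketch of the same structure for a plain row sum:

    #include <arm_neon.h>

    float row_sum(const float *data, int n)
    {
        float32x4_t vsum = vdupq_n_f32(0.0f);
        int         x    = 0;
        for(; x <= n - 4; x += 4) // vector body: four elements per step
        {
            vsum = vaddq_f32(vsum, vld1q_f32(data + x));
        }
        // Horizontal reduction of the four lanes (the 32-bit path above)
        float32x2_t tmp = vpadd_f32(vget_high_f32(vsum), vget_low_f32(vsum));
        tmp             = vpadd_f32(tmp, tmp);
        float sum       = vget_lane_f32(tmp, 0);
        for(; x < n; ++x) // scalar leftover loop
        {
            sum += data[x];
        }
        return sum;
    }
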
diff --git a/src/core/NEON/kernels/NEWarpKernel.cpp b/src/core/NEON/kernels/NEWarpKernel.cpp
index 62f4e5d..ab8ab14 100644
--- a/src/core/NEON/kernels/NEWarpKernel.cpp
+++ b/src/core/NEON/kernels/NEWarpKernel.cpp
@@ -143,7 +143,11 @@
     const float start_y0 = M10 * window.x().start();
 
     // Current row
-    int y_cur = window.y().start();
+    int y_cur  = window.y().start();
+    int z_cur  = window.z().start();
+    int d3_cur = window[3].start();
+    int d4_cur = window[4].start();
+    int d5_cur = window[5].start();
 
     // const_x0 and const_y0 are the constant parts of x0 and y0 during the row processing
     float const_x0 = M01 * y_cur + M02;
@@ -155,10 +159,14 @@
 
     execute_window_loop(window, [&](const Coordinates & id)
     {
-        // Check if we are processing a new row. If so, update the current row (y_cur), x0 and y0
-        if(y_cur != id.y())
+        // Check if we are processing a new row. If so, update the cached window coordinates and recompute the constant parts of x0 and y0
+        if((y_cur != id.y()) || (z_cur != id.z()) || (d3_cur != id[3]) || (d4_cur != id[4]) || (d5_cur != id[5]))
         {
-            y_cur = id.y();
+            y_cur  = id.y();
+            z_cur  = id.z();
+            d3_cur = id[3];
+            d4_cur = id[4];
+            d5_cur = id[5];
 
             const_x0 = M01 * y_cur + M02;
             const_y0 = M11 * y_cur + M12;
@@ -222,7 +230,11 @@
     const float start_y0 = M10 * window.x().start();
 
     // Current row
-    int y_cur = window.y().start();
+    int y_cur  = window.y().start();
+    int z_cur  = window.z().start();
+    int d3_cur = window[3].start();
+    int d4_cur = window[4].start();
+    int d5_cur = window[5].start();
 
     // const_x0 and const_y0 are the constant parts of x0 and y0 during the row processing
     float const_x0 = M01 * y_cur + M02;
@@ -234,10 +246,14 @@
 
     execute_window_loop(window, [&](const Coordinates & id)
     {
-        // Check if we are processing a new row. If so, update the current row (y_cur), x0 and y0
-        if(y_cur != id.y())
+        // Check if we are processing a new row. If so, update the cached window coordinates and recompute the constant parts of x0 and y0
+        if((y_cur != id.y()) || (z_cur != id.z()) || (d3_cur != id[3]) || (d4_cur != id[4]) || (d5_cur != id[5]))
         {
-            y_cur = id.y();
+            y_cur  = id.y();
+            z_cur  = id.z();
+            d3_cur = id[3];
+            d4_cur = id[4];
+            d5_cur = id[5];
 
             const_x0 = M01 * y_cur + M02;
             const_y0 = M11 * y_cur + M12;
@@ -264,7 +280,34 @@
         }
         else
         {
-            *out.ptr() = _constant_border_value;
+            switch(interpolation)
+            {
+                case InterpolationPolicy::NEAREST_NEIGHBOR:
+                    *out.ptr() = _constant_border_value;
+                    break;
+                case InterpolationPolicy::BILINEAR:
+                {
+                    const auto xi   = clamp<int>(std::floor(x0), min_x - 1, max_x);
+                    const auto yi   = clamp<int>(std::floor(y0), min_y - 1, max_y);
+                    const auto xi_1 = clamp<int>(std::floor(x0 + 1), min_x - 1, max_x);
+                    const auto yi_1 = clamp<int>(std::floor(y0 + 1), min_y - 1, max_y);
+
+                    const float dx  = x0 - std::floor(x0);
+                    const float dy  = y0 - std::floor(y0);
+                    const float dx1 = 1.0f - dx;
+                    const float dy1 = 1.0f - dy;
+
+                    const float a00 = *(in.ptr() + xi + yi * stride);
+                    const float a01 = *(in.ptr() + xi_1 + yi * stride);
+                    const float a10 = *(in.ptr() + xi + yi_1 * stride);
+                    const float a11 = *(in.ptr() + xi_1 + yi_1 * stride);
+
+                    *out.ptr() = a00 * (dx1 * dy1) + a01 * (dx * dy1) + a10 * (dx1 * dy) + a11 * (dx * dy);
+                }
+                break;
+                default:
+                    ARM_COMPUTE_ERROR("Interpolation not supported");
+            }
         }
 
         x0 += M00;
@@ -292,7 +335,11 @@
     const size_t stride = _input->info()->strides_in_bytes()[1];
 
     // Current row
-    int y_cur = window.y().start();
+    int y_cur  = window.y().start();
+    int z_cur  = window.z().start();
+    int d3_cur = window[3].start();
+    int d4_cur = window[4].start();
+    int d5_cur = window[5].start();
 
     const float M00 = _matrix[0];
     const float M10 = _matrix[1];
@@ -314,10 +361,14 @@
 
     execute_window_loop(window, [&](const Coordinates & id)
     {
-        // Check if we are processing a new row. If so, update the current row (y_cur), x0 and y0
-        if(y_cur != id.y())
+        // Check if we are processing a new row. If so, update the cached window coordinates and recompute the constant parts of x0 and y0
+        if((y_cur != id.y()) || (z_cur != id.z()) || (d3_cur != id[3]) || (d4_cur != id[4]) || (d5_cur != id[5]))
         {
-            y_cur = id.y();
+            y_cur  = id.y();
+            z_cur  = id.z();
+            d3_cur = id[3];
+            d4_cur = id[4];
+            d5_cur = id[5];
 
             const_x0 = M01 * y_cur + M02;
             const_y0 = M11 * y_cur + M12;
@@ -345,10 +396,34 @@
         else
         {
             // Clamp coordinates
-            const auto xi = clamp<int>(x0, min_x, max_x - 1);
-            const auto yi = clamp<int>(y0, min_y, max_y - 1);
+            const auto xi = clamp<int>(std::floor(x0), min_x, max_x - 1);
+            const auto yi = clamp<int>(std::floor(y0), min_y, max_y - 1);
+            switch(interpolation)
+            {
+                case InterpolationPolicy::NEAREST_NEIGHBOR:
+                    *out.ptr() = *(in.ptr() + xi + yi * stride);
+                    break;
+                case InterpolationPolicy::BILINEAR:
+                {
+                    const auto xi_1 = clamp<int>(std::floor(x0 + 1), min_x, max_x - 1);
+                    const auto yi_1 = clamp<int>(std::floor(y0 + 1), min_y, max_y - 1);
 
-            *out.ptr() = *(in.ptr() + xi + yi * stride);
+                    const float dx  = x0 - std::floor(x0);
+                    const float dy  = y0 - std::floor(y0);
+                    const float dx1 = 1.0f - dx;
+                    const float dy1 = 1.0f - dy;
+
+                    const float a00 = *(in.ptr() + xi + yi * stride);
+                    const float a01 = *(in.ptr() + xi_1 + yi * stride);
+                    const float a10 = *(in.ptr() + xi + yi_1 * stride);
+                    const float a11 = *(in.ptr() + xi_1 + yi_1 * stride);
+
+                    *out.ptr() = a00 * (dx1 * dy1) + a01 * (dx * dy1) + a10 * (dx1 * dy) + a11 * (dx * dy);
+                }
+                break;
+                default:
+                    ARM_COMPUTE_ERROR("Interpolation not supported");
+            }
         }
 
         x0 += M00;
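
The BILINEAR branches added above weight the four neighbouring pixels by the sub-pixel offsets (dx, dy); the four weights dx1*dy1, dx*dy1, dx1*dy and dx*dy sum to one. A scalar reference of the sampling, leaving out the border clamping the kernel performs (the helper name is illustrative):

    #include <cmath>
    #include <cstddef>
    #include <cstdint>

    // Sample a single-channel uint8_t image at a real-valued coordinate.
    // The caller must keep (x, y) at least one pixel inside the image.
    float bilinear_sample(const uint8_t *img, std::size_t stride, float x, float y)
    {
        const int   xi  = static_cast<int>(std::floor(x));
        const int   yi  = static_cast<int>(std::floor(y));
        const float dx  = x - xi;
        const float dy  = y - yi;
        const float dx1 = 1.0f - dx;
        const float dy1 = 1.0f - dy;

        const float a00 = img[xi + yi * stride];
        const float a01 = img[(xi + 1) + yi * stride];
        const float a10 = img[xi + (yi + 1) * stride];
        const float a11 = img[(xi + 1) + (yi + 1) * stride];

        return a00 * (dx1 * dy1) + a01 * (dx * dy1) + a10 * (dx1 * dy) + a11 * (dx * dy);
    }
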
diff --git a/src/graph/Graph.cpp b/src/graph/Graph.cpp
index 525506f..7dddb1c 100644
--- a/src/graph/Graph.cpp
+++ b/src/graph/Graph.cpp
@@ -46,28 +46,22 @@
      *
-     * @param _next_hint Device execution hint
+     * @param _next_hints Execution hints (target and convolution method)
      */
-    void configure(Hint _next_hint);
+    void configure(GraphHints _next_hints);
 
-    /** Sets whether to enable information print out
-     *
-     * @param[in] is_enabled Set to true if need info printed out
-     */
-    void set_info_enablement(bool is_enabled);
-
+    GraphContext                         _ctx{};
     std::vector<Stage>                   _pipeline{};
     std::vector<std::unique_ptr<Tensor>> _tensors{};
     std::vector<std::unique_ptr<INode>>  _nodes{};
-    Hint                                 _current_hint{ Hint::DONT_CARE };
-    Hint                                 _next_hint{ Hint::DONT_CARE };
+    GraphHints                           _current_hints{};
+    GraphHints                           _next_hints{};
     std::unique_ptr<Tensor>              _graph_input{ nullptr };
     std::unique_ptr<Tensor>              _graph_output{ nullptr };
     std::unique_ptr<INode>               _current_node{ nullptr };
     Tensor                              *_current_output{ nullptr };
-    bool                                 _info_enabled{ false };
 
 private:
-    Tensor *_current_input{ nullptr };
-    Hint    _previous_hint{ Hint::DONT_CARE };
+    Tensor    *_current_input{ nullptr };
+    GraphHints _previous_hints{};
 };
 
 Graph::~Graph() //NOLINT
@@ -102,7 +96,7 @@
 }
 
 //Finalize current node's configuration
-void Graph::Private::configure(Hint _next_hint)
+void Graph::Private::configure(GraphHints _next_hints)
 {
     ARM_COMPUTE_ERROR_ON(_current_node == nullptr);
     ARM_COMPUTE_ERROR_ON(_graph_input == nullptr);
@@ -110,9 +104,9 @@
     // Is it the first node of the graph ?
     if(_current_input == nullptr)
     {
-        _graph_input->set_target(_current_hint);
-        _current_input = _graph_input.get();
-        _previous_hint = _current_hint; // For the first node just assume the previous node was of the same type as this one
+        _graph_input->set_target(_current_hints.target_hint());
+        _current_input  = _graph_input.get();
+        _previous_hints = _current_hints; // For the first node just assume the previous node was of the same type as this one
     }
 
     //Automatic output configuration ?
@@ -123,29 +117,31 @@
     }
 
     // If either the writer or reader node needs OpenCL then use OpenCL memory:
-    if((_next_hint == Hint::OPENCL || _current_hint == Hint::OPENCL))
+    if((_next_hints.target_hint() == TargetHint::OPENCL || _current_hints.target_hint() == TargetHint::OPENCL))
     {
-        _current_output->set_target(Hint::OPENCL);
+        _current_output->set_target(TargetHint::OPENCL);
     }
     else
     {
-        _current_output->set_target(Hint::NEON);
+        _current_output->set_target(TargetHint::NEON);
     }
 
-    // Map input if needed
-    std::unique_ptr<arm_compute::IFunction> func = _current_node->instantiate_node(_current_hint, _current_input->tensor(), _current_output->tensor());
+    // Update ctx and instantiate node
+    _ctx.hints()                                 = _current_hints;
+    std::unique_ptr<arm_compute::IFunction> func = _current_node->instantiate_node(_ctx, _current_input->tensor(), _current_output->tensor());
     _current_input->allocate();
 
-    if(_current_input->target() == Hint::OPENCL)
+    // Map input if needed
+    if(_current_input->target() == TargetHint::OPENCL)
     {
-        if(_previous_hint == Hint::NEON)
+        if(_previous_hints.target_hint() == TargetHint::NEON)
         {
-            ARM_COMPUTE_ERROR_ON(_current_hint == Hint::NEON);
+            ARM_COMPUTE_ERROR_ON(_current_hints.target_hint() == TargetHint::NEON);
             _pipeline.push_back({ _current_input, _current_input, arm_compute::support::cpp14::make_unique<CLUnmap>(_current_input) });
         }
-        if(_current_hint == Hint::NEON)
+        if(_current_hints.target_hint() == TargetHint::NEON)
         {
-            ARM_COMPUTE_ERROR_ON(_previous_hint == Hint::NEON);
+            ARM_COMPUTE_ERROR_ON(_previous_hints.target_hint() == TargetHint::NEON);
             _pipeline.push_back({ _current_input, _current_input, arm_compute::support::cpp14::make_unique<CLMap>(_current_input, true) });
         }
     }
@@ -154,13 +150,8 @@
 
     _current_input  = _current_output;
     _current_output = nullptr;
-    _previous_hint  = _current_hint;
-    _current_hint   = _next_hint;
-}
-
-void Graph::Private::set_info_enablement(bool is_enabled)
-{
-    _info_enabled = is_enabled;
+    std::swap(_previous_hints, _current_hints);
+    std::swap(_current_hints, _next_hints);
 }
 
 void Graph::add_node(std::unique_ptr<INode> node)
@@ -169,22 +160,18 @@
     ARM_COMPUTE_ERROR_ON_MSG(_pimpl->_graph_output != nullptr, "Nothing can be added after the output tensor");
     //Trigger the creation of the current Node:
 
-    Hint _next_hint = node->override_hint(_pimpl->_next_hint);
-    ARM_COMPUTE_ERROR_ON(_next_hint == Hint::DONT_CARE);
+    GraphHints _next_hints = _pimpl->_next_hints;
+    _next_hints.set_target_hint(node->override_target_hint(_pimpl->_next_hints.target_hint()));
+    ARM_COMPUTE_ERROR_ON(_next_hints.target_hint() == TargetHint::DONT_CARE);
     if(_pimpl->_current_node)
     {
         //Finalize the previous Node:
-        _pimpl->configure(_pimpl->_next_hint);
-
-        if(_pimpl->_info_enabled)
-        {
-            _pimpl->_current_node->print_info();
-        }
+        _pimpl->configure(_pimpl->_next_hints);
     }
     else
     {
-        // If that's the first node then use the same Hint before and after the node.
-        _pimpl->_current_hint = _next_hint;
+        // If that's the first node then use the same TargetHint before and after the node.
+        _pimpl->_current_hints = _next_hints;
     }
     if(_pimpl->_current_node)
     {
@@ -192,15 +179,6 @@
     }
     _pimpl->_current_node = std::move(node);
 }
-void Graph::set_hint(Hint hint)
-{
-    _pimpl->_next_hint = hint;
-}
-
-void Graph::set_info_enablement(bool is_enabled)
-{
-    _pimpl->set_info_enablement(is_enabled);
-}
 
 //Add a tensor with an Accessor (i.e either the input or output of the graph)
 void Graph::add_tensor(std::unique_ptr<Tensor> tensor)
@@ -221,7 +199,7 @@
         _pimpl->_current_output = _pimpl->_graph_output.get();
 
         // Finalize the graph by configuring the last Node of the graph:
-        _pimpl->configure(_pimpl->_current_hint); // Ignore _next_hint as this is the last node, and just use the same hint as before this node.
+        _pimpl->configure(_pimpl->_current_hints); // Ignore _next_hints as this is the last node, and just use the same hints as before this node.
         _pimpl->_graph_output->allocate();
     }
 }
@@ -236,6 +214,11 @@
     _pimpl->_current_output = _pimpl->_tensors.back().get();
 }
 
+GraphHints &Graph::hints()
+{
+    return _pimpl->_next_hints;
+}
+
 Graph &arm_compute::graph::operator<<(Graph &graph, TensorInfo &&info)
 {
     graph.set_temp(std::move(info));
@@ -248,8 +231,14 @@
     return graph;
 }
 
-Graph &arm_compute::graph::operator<<(Graph &graph, Hint hint)
+Graph &arm_compute::graph::operator<<(Graph &graph, TargetHint target_hint)
 {
-    graph.set_hint(hint);
+    graph.hints().set_target_hint(target_hint);
+    return graph;
+}
+
+Graph &arm_compute::graph::operator<<(Graph &graph, ConvolutionMethodHint conv_method_hint)
+{
+    graph.hints().set_convolution_method_hint(conv_method_hint);
     return graph;
 }
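
With the hints now bundled in GraphHints, a target preference and a convolution-method preference stream into the graph independently through the two operators above. A hedged usage sketch (ConvolutionMethodHint::GEMM is assumed as one of the enumerators this patch introduces):

    #include "arm_compute/graph/Graph.h"
    #include "arm_compute/graph/Types.h"

    using namespace arm_compute::graph;

    void configure_hints(Graph &graph)
    {
        graph << TargetHint::OPENCL           // replaces the old Hint::OPENCL
              << ConvolutionMethodHint::GEMM; // assumed enumerator
    }
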
diff --git a/src/graph/GraphContext.cpp b/src/graph/GraphContext.cpp
new file mode 100644
index 0000000..bfc6fcd
--- /dev/null
+++ b/src/graph/GraphContext.cpp
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/GraphContext.h"
+
+using namespace arm_compute::graph;
+
+GraphHints::GraphHints(TargetHint target_hint, ConvolutionMethodHint conv_method_hint)
+    : _target_hint(target_hint), _convolution_method_hint(conv_method_hint)
+{
+}
+
+void GraphHints::set_target_hint(TargetHint target_hint)
+{
+    _target_hint = target_hint;
+}
+
+void GraphHints::set_convolution_method_hint(ConvolutionMethodHint convolution_method)
+{
+    _convolution_method_hint = convolution_method;
+}
+
+TargetHint GraphHints::target_hint() const
+{
+    return _target_hint;
+}
+
+ConvolutionMethodHint GraphHints::convolution_method_hint() const
+{
+    return _convolution_method_hint;
+}
+
+GraphContext::GraphContext()
+    : _hints()
+{
+}
+
+GraphHints &GraphContext::hints()
+{
+    return _hints;
+}
+
+const GraphHints &GraphContext::hints() const
+{
+    return _hints;
+}
\ No newline at end of file
diff --git a/src/graph/INode.cpp b/src/graph/INode.cpp
index 6b25022..4b383f5 100644
--- a/src/graph/INode.cpp
+++ b/src/graph/INode.cpp
@@ -21,7 +21,6 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-
 #include "arm_compute/graph/INode.h"
 
 #include "arm_compute/core/CL/OpenCL.h"
@@ -31,17 +30,20 @@
 
 using namespace arm_compute::graph;
 
-Hint INode::override_hint(Hint hint) const
+TargetHint INode::override_target_hint(TargetHint target_hint) const
 {
-    if(hint == Hint::OPENCL && !opencl_is_available())
+    if(target_hint == TargetHint::OPENCL && !opencl_is_available())
     {
-        hint = Hint::DONT_CARE;
+        target_hint = TargetHint::DONT_CARE;
     }
-    hint = node_override_hint(hint);
-    ARM_COMPUTE_ERROR_ON(hint == Hint::OPENCL && !opencl_is_available());
-    return hint;
+    GraphHints hints{ target_hint };
+    target_hint = node_override_hints(hints).target_hint();
+    ARM_COMPUTE_ERROR_ON(target_hint == TargetHint::OPENCL && !opencl_is_available());
+    return target_hint;
 }
-Hint INode::node_override_hint(Hint hint) const
+GraphHints INode::node_override_hints(GraphHints hints) const
 {
-    return hint == Hint::DONT_CARE ? Hint::NEON : hint;
+    TargetHint target_hint = hints.target_hint();
+    hints.set_target_hint((target_hint == TargetHint::DONT_CARE) ? TargetHint::NEON : target_hint);
+    return hints;
 }
diff --git a/src/graph/SubTensor.cpp b/src/graph/SubTensor.cpp
new file mode 100644
index 0000000..abf8506
--- /dev/null
+++ b/src/graph/SubTensor.cpp
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/SubTensor.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLSubTensor.h"
+#include "arm_compute/runtime/SubTensor.h"
+#include "utils/TypePrinter.h"
+
+using namespace arm_compute::graph;
+
+namespace
+{
+template <typename SubTensorType, typename ParentTensorType>
+std::unique_ptr<ITensor> initialise_subtensor(ITensor *parent, TensorShape shape, Coordinates coords)
+{
+    auto ptensor   = dynamic_cast<ParentTensorType *>(parent);
+    auto subtensor = arm_compute::support::cpp14::make_unique<SubTensorType>(ptensor, shape, coords);
+    return std::move(subtensor);
+}
+} // namespace
+
+SubTensor::SubTensor()
+    : _target(TargetHint::DONT_CARE), _coords(), _info(), _parent(nullptr), _subtensor(nullptr)
+{
+}
+
+SubTensor::SubTensor(Tensor &parent, TensorShape tensor_shape, Coordinates coords)
+    : _target(TargetHint::DONT_CARE), _coords(coords), _info(), _parent(nullptr), _subtensor(nullptr)
+{
+    ARM_COMPUTE_ERROR_ON(parent.tensor() == nullptr);
+    _parent = parent.tensor();
+    _info   = SubTensorInfo(parent.tensor()->info(), tensor_shape, coords);
+    _target = parent.target();
+
+    instantiate_subtensor();
+}
+
+SubTensor::SubTensor(ITensor *parent, TensorShape tensor_shape, Coordinates coords, TargetHint target)
+    : _target(target), _coords(coords), _info(), _parent(parent), _subtensor(nullptr)
+{
+    ARM_COMPUTE_ERROR_ON(parent == nullptr);
+    _info = SubTensorInfo(parent->info(), tensor_shape, coords);
+
+    instantiate_subtensor();
+}
+
+void SubTensor::set_info(SubTensorInfo &&info)
+{
+    _info = info;
+}
+
+const SubTensorInfo &SubTensor::info() const
+{
+    return _info;
+}
+
+ITensor *SubTensor::tensor()
+{
+    return _subtensor.get();
+}
+
+TargetHint SubTensor::target() const
+{
+    return _target;
+}
+
+void SubTensor::instantiate_subtensor()
+{
+    switch(_target)
+    {
+        case TargetHint::OPENCL:
+            _subtensor = initialise_subtensor<arm_compute::CLSubTensor, arm_compute::ICLTensor>(_parent, _info.tensor_shape(), _coords);
+            break;
+        case TargetHint::NEON:
+            _subtensor = initialise_subtensor<arm_compute::SubTensor, arm_compute::ITensor>(_parent, _info.tensor_shape(), _coords);
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Invalid TargetHint");
+    }
+}
diff --git a/src/graph/Tensor.cpp b/src/graph/Tensor.cpp
index c534ae0..31dd4e8 100644
--- a/src/graph/Tensor.cpp
+++ b/src/graph/Tensor.cpp
@@ -21,7 +21,6 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-
 #include "arm_compute/graph/Tensor.h"
 
 #include "arm_compute/core/Error.h"
@@ -53,7 +52,7 @@
 } // namespace
 
 Tensor::Tensor(TensorInfo &&info)
-    : _target(Hint::DONT_CARE), _info(info), _accessor(nullptr), _tensor(nullptr)
+    : _target(TargetHint::DONT_CARE), _info(info), _accessor(nullptr), _tensor(nullptr)
 {
 }
 
@@ -96,7 +95,7 @@
     return _info;
 }
 
-ITensor *Tensor::set_target(Hint target)
+ITensor *Tensor::set_target(TargetHint target)
 {
     if(_tensor != nullptr)
     {
@@ -106,14 +105,14 @@
     {
         switch(target)
         {
-            case Hint::OPENCL:
+            case TargetHint::OPENCL:
                 _tensor = initialise_tensor<arm_compute::CLTensor>(_info);
                 break;
-            case Hint::NEON:
+            case TargetHint::NEON:
                 _tensor = initialise_tensor<arm_compute::Tensor>(_info);
                 break;
             default:
-                ARM_COMPUTE_ERROR("Invalid Hint");
+                ARM_COMPUTE_ERROR("Invalid TargetHint");
         }
         _target = target;
     }
@@ -125,14 +124,14 @@
     ARM_COMPUTE_ERROR_ON_NULLPTR(_tensor.get());
     switch(_target)
     {
-        case Hint::OPENCL:
+        case TargetHint::OPENCL:
             tensor_allocate<arm_compute::CLTensor>(*_tensor);
             break;
-        case Hint::NEON:
+        case TargetHint::NEON:
             tensor_allocate<arm_compute::Tensor>(*_tensor);
             break;
         default:
-            ARM_COMPUTE_ERROR("Invalid Hint");
+            ARM_COMPUTE_ERROR("Invalid TargetHint");
     }
 }
 
@@ -145,7 +144,7 @@
     }
 }
 
-Hint Tensor::target() const
+TargetHint Tensor::target() const
 {
     return _target;
 }
diff --git a/src/graph/nodes/ActivationLayer.cpp b/src/graph/nodes/ActivationLayer.cpp
index b71e22c..5cd2a0b 100644
--- a/src/graph/nodes/ActivationLayer.cpp
+++ b/src/graph/nodes/ActivationLayer.cpp
@@ -23,6 +23,7 @@
  */
 #include "arm_compute/graph/nodes/ActivationLayer.h"
 
+#include "arm_compute/core/Logger.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/CL/functions/CLActivationLayer.h"
 #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
@@ -34,7 +35,7 @@
 
 namespace
 {
-template <typename ActivationType, typename TensorType, Hint hint>
+template <typename ActivationType, typename TensorType, TargetHint target_hint>
 std::unique_ptr<arm_compute::IFunction> instantiate_function(ITensor *input, ITensor *output, const ActivationLayerInfo &activation_info)
 {
     auto activation = arm_compute::support::cpp14::make_unique<ActivationType>();
@@ -46,19 +47,19 @@
     return std::move(activation);
 }
 
-template <Hint                          hint>
+template <TargetHint                    target_hint>
 std::unique_ptr<arm_compute::IFunction> instantiate(ITensor *input, ITensor *output, const ActivationLayerInfo &activation_info);
 
 template <>
-std::unique_ptr<arm_compute::IFunction> instantiate<Hint::OPENCL>(ITensor *input, ITensor *output, const ActivationLayerInfo &activation_info)
+std::unique_ptr<arm_compute::IFunction> instantiate<TargetHint::OPENCL>(ITensor *input, ITensor *output, const ActivationLayerInfo &activation_info)
 {
-    return instantiate_function<arm_compute::CLActivationLayer, arm_compute::CLTensor, Hint::OPENCL>(input, output, activation_info);
+    return instantiate_function<arm_compute::CLActivationLayer, arm_compute::CLTensor, TargetHint::OPENCL>(input, output, activation_info);
 }
 
 template <>
-std::unique_ptr<arm_compute::IFunction> instantiate<Hint::NEON>(ITensor *input, ITensor *output, const ActivationLayerInfo &activation_info)
+std::unique_ptr<arm_compute::IFunction> instantiate<TargetHint::NEON>(ITensor *input, ITensor *output, const ActivationLayerInfo &activation_info)
 {
-    return instantiate_function<arm_compute::NEActivationLayer, arm_compute::Tensor, Hint::NEON>(input, output, activation_info);
+    return instantiate_function<arm_compute::NEActivationLayer, arm_compute::Tensor, TargetHint::NEON>(input, output, activation_info);
 }
 } // namespace
 
@@ -67,40 +68,28 @@
 {
 }
 
-std::unique_ptr<arm_compute::IFunction> ActivationLayer::instantiate_node(Hint hint, ITensor *input, ITensor *output)
+std::unique_ptr<arm_compute::IFunction> ActivationLayer::instantiate_node(GraphContext &ctx, ITensor *input, ITensor *output)
 {
     std::unique_ptr<arm_compute::IFunction> func;
-    _hint   = hint;
-    _input  = input;
-    _output = output;
+    _target_hint = ctx.hints().target_hint();
 
-    if(_hint == Hint::OPENCL)
+    if(_target_hint == TargetHint::OPENCL)
     {
-        func = instantiate<Hint::OPENCL>(input, output, _activation_info);
+        func = instantiate<TargetHint::OPENCL>(input, output, _activation_info);
+        ARM_COMPUTE_LOG("Instantiating CLActivationLayer");
     }
     else
     {
-        func = instantiate<Hint::NEON>(input, output, _activation_info);
+        func = instantiate<TargetHint::NEON>(input, output, _activation_info);
+        ARM_COMPUTE_LOG("Instantiating NEActivationLayer");
     }
+
+    ARM_COMPUTE_LOG(" Data Type: " << input->info()->data_type()
+                    << " Input shape: " << input->info()->tensor_shape()
+                    << " Output shape: " << output->info()->tensor_shape()
+                    << " Activation function: " << _activation_info.activation()
+                    << " a: " << _activation_info.a()
+                    << " b: " << _activation_info.b()
+                    << std::endl);
     return func;
 }
-
-void ActivationLayer::print_info()
-{
-    if(_hint == Hint::OPENCL)
-    {
-        std::cout << "Instantiating CLActivationLayer";
-    }
-    else
-    {
-        std::cout << "Instantiating NEActivationLayer";
-    }
-
-    std::cout << " Data Type: " << _input->info()->data_type()
-              << " Input shape: " << _input->info()->tensor_shape()
-              << " Output shape: " << _output->info()->tensor_shape()
-              << " Activation function: " << _activation_info.activation()
-              << " a: " << _activation_info.a()
-              << " b: " << _activation_info.b()
-              << std::endl;
-}
diff --git a/src/graph/nodes/BatchNormalizationLayer.cpp b/src/graph/nodes/BatchNormalizationLayer.cpp
new file mode 100644
index 0000000..a6a990f
--- /dev/null
+++ b/src/graph/nodes/BatchNormalizationLayer.cpp
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/nodes/BatchNormalizationLayer.h"
+
+#include "arm_compute/core/Logger.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "support/ToolchainSupport.h"
+#include "utils/TypePrinter.h"
+
+using namespace arm_compute::graph;
+
+namespace
+{
+template <typename BatchNormalizationType, typename TensorType, TargetHint target_hint>
+std::unique_ptr<arm_compute::IFunction> instantiate_function(ITensor *input, ITensor *output, Tensor &mean, Tensor &var, Tensor &beta, Tensor &gamma, float epsilon)
+{
+    auto norm = arm_compute::support::cpp14::make_unique<BatchNormalizationType>();
+    norm->configure(
+        dynamic_cast<TensorType *>(input),
+        dynamic_cast<TensorType *>(output),
+        dynamic_cast<TensorType *>(mean.set_target(target_hint)),
+        dynamic_cast<TensorType *>(var.set_target(target_hint)),
+        dynamic_cast<TensorType *>(beta.set_target(target_hint)),
+        dynamic_cast<TensorType *>(gamma.set_target(target_hint)),
+        epsilon);
+
+    return std::move(norm);
+}
+
+template <TargetHint                    target_hint>
+std::unique_ptr<arm_compute::IFunction> instantiate(ITensor *input, ITensor *output, Tensor &mean, Tensor &var, Tensor &beta, Tensor &gamma, float epsilon);
+
+template <>
+std::unique_ptr<arm_compute::IFunction> instantiate<TargetHint::OPENCL>(ITensor *input, ITensor *output, Tensor &mean, Tensor &var, Tensor &beta, Tensor &gamma, float epsilon)
+{
+    return instantiate_function<arm_compute::CLBatchNormalizationLayer, arm_compute::ICLTensor, TargetHint::OPENCL>(input, output, mean, var, beta, gamma, epsilon);
+}
+
+template <>
+std::unique_ptr<arm_compute::IFunction> instantiate<TargetHint::NEON>(ITensor *input, ITensor *output, Tensor &mean, Tensor &var, Tensor &beta, Tensor &gamma, float epsilon)
+{
+    return instantiate_function<arm_compute::NEBatchNormalizationLayer, arm_compute::ITensor, TargetHint::NEON>(input, output, mean, var, beta, gamma, epsilon);
+}
+} // namespace
+
+std::unique_ptr<arm_compute::IFunction> BatchNormalizationLayer::instantiate_node(GraphContext &ctx, ITensor *input, ITensor *output)
+{
+    std::unique_ptr<arm_compute::IFunction> func;
+    _target_hint = ctx.hints().target_hint();
+
+    unsigned int batch_norm_size = input->info()->dimension(2);
+    if(_mean.tensor() == nullptr)
+    {
+        _mean.set_info(TensorInfo(TensorShape(batch_norm_size), input->info()->num_channels(), input->info()->data_type(), input->info()->fixed_point_position()));
+    }
+    if(_var.tensor() == nullptr)
+    {
+        _var.set_info(TensorInfo(TensorShape(batch_norm_size), input->info()->num_channels(), input->info()->data_type(), input->info()->fixed_point_position()));
+    }
+    if(_beta.tensor() == nullptr)
+    {
+        _beta.set_info(TensorInfo(TensorShape(batch_norm_size), input->info()->num_channels(), input->info()->data_type(), input->info()->fixed_point_position()));
+    }
+    if(_gamma.tensor() == nullptr)
+    {
+        _gamma.set_info(TensorInfo(TensorShape(batch_norm_size), input->info()->num_channels(), input->info()->data_type(), input->info()->fixed_point_position()));
+    }
+
+    if(_target_hint == TargetHint::OPENCL)
+    {
+        func = instantiate<TargetHint::OPENCL>(input, output, _mean, _var, _beta, _gamma, _epsilon);
+        ARM_COMPUTE_LOG("Instantiating CLBatchNormalizationLayer");
+    }
+    else
+    {
+        func = instantiate<TargetHint::NEON>(input, output, _mean, _var, _beta, _gamma, _epsilon);
+        ARM_COMPUTE_LOG("Instantiating NEBatchNormalizationLayer");
+    }
+
+    ARM_COMPUTE_LOG(" Data Type: " << input->info()->data_type()
+                    << " Input shape: " << input->info()->tensor_shape()
+                    << " Output shape: " << output->info()->tensor_shape()
+                    << std::endl);
+
+    return func;
+}
\ No newline at end of file
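
A side note on the shapes set up above: mean, var, beta and gamma are auto-created as 1D tensors whose length equals the input's depth (dimension 2). A minimal sketch of that arithmetic, with an illustrative 28x28x64 input (the values are assumptions, not taken from the patch):

#include <cassert>

int main()
{
    // Illustrative input shape: 28 x 28 x 64 (W x H x C)
    const unsigned int input_depth     = 64;          // input->info()->dimension(2)
    const unsigned int batch_norm_size = input_depth; // length of mean/var/beta/gamma
    assert(batch_norm_size == 64);
    return 0;
}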
diff --git a/src/graph/nodes/ConvolutionLayer.cpp b/src/graph/nodes/ConvolutionLayer.cpp
index b80bf93..b47be8d 100644
--- a/src/graph/nodes/ConvolutionLayer.cpp
+++ b/src/graph/nodes/ConvolutionLayer.cpp
@@ -23,61 +23,159 @@
  */
 #include "arm_compute/graph/nodes/ConvolutionLayer.h"
 
+#include "arm_compute/core/Logger.h"
 #include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"
+#include "arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h"
+#include "arm_compute/runtime/IFunction.h"
 #include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h"
 #include "support/ToolchainSupport.h"
+#include "utils/GraphTypePrinter.h"
 #include "utils/TypePrinter.h"
 
+#include <tuple>
+#include <vector>
+
 using namespace arm_compute::graph;
 
 namespace
 {
-template <typename ConvolutionType, typename TensorType, Hint hint>
-std::unique_ptr<arm_compute::IFunction> instantiate_function(ITensor *input, Tensor &weights, Tensor &biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info)
+/** Calculates the output shape of the convolution layer
+ *
+ * @param[in] input_shape   Input tensor shape
+ * @param[in] weights_shape Weights shape
+ * @param[in] conv_info     Convolution information (padding, stride, etc.)
+ *
+ * @return The expected output tensor shape
+ */
+TensorShape calculate_convolution_layer_output_shape(const TensorShape &input_shape, const TensorShape &weights_shape, const PadStrideInfo &conv_info)
 {
-    bool weights_are_loaded = weights.tensor() != nullptr;
-    bool biases_are_loaded  = biases.tensor() != nullptr;
+    unsigned int output_width  = 0;
+    unsigned int output_height = 0;
 
+    // Get output width and height
+    std::tie(output_width, output_height) = arm_compute::scaled_dimensions(input_shape.x(), input_shape.y(), weights_shape.x(), weights_shape.y(), conv_info);
+
+    // Create output shape
+    TensorShape output_shape = input_shape;
+    output_shape.set(0, output_width);
+    output_shape.set(1, output_height);
+    output_shape.set(2, weights_shape[3]);
+
+    return output_shape;
+}
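
A quick worked check of this helper's arithmetic, assuming scaled_dimensions uses the usual (in + 2*pad - kernel) / stride + 1 formula with floor rounding (illustrative values, not from the patch):

#include <cassert>

int main()
{
    // 224x224 input, 7x7 kernel, stride 2, pad 3 (a typical first conv layer)
    const unsigned int in_w = 224, kernel_w = 7, stride = 2, pad = 3;
    const unsigned int out_w = (in_w + 2 * pad - kernel_w) / stride + 1;
    assert(out_w == 112); // output_shape would be 112 x 112 x weights_shape[3]
    return 0;
}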
+
+// Instantiate GEMM based convolution layer
+template <typename ConvolutionType, typename TensorType, TargetHint target_hint>
+std::unique_ptr<arm_compute::IFunction> instantiate_function(ITensor *input, ITensor *weights, ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info)
+{
     auto conv = arm_compute::support::cpp14::make_unique<ConvolutionType>();
     conv->configure(
         dynamic_cast<TensorType *>(input),
-        dynamic_cast<TensorType *>(weights.set_target(hint)),
-        dynamic_cast<TensorType *>(biases.set_target(hint)),
+        dynamic_cast<TensorType *>(weights),
+        dynamic_cast<TensorType *>(biases),
         dynamic_cast<TensorType *>(output),
         conv_info, weights_info);
-    if(!weights_are_loaded)
-    {
-        weights.allocate_and_fill_if_needed();
-    }
-    if(!biases_are_loaded)
-    {
-        biases.allocate_and_fill_if_needed();
-    }
-
     return std::move(conv);
 }
 
-template <Hint                          hint>
-std::unique_ptr<arm_compute::IFunction> instantiate(ITensor *input, Tensor &weights, Tensor &biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info);
+// Instantiate direct convolution layer
+template <typename ConvolutionType, typename TensorType, TargetHint target_hint>
+std::unique_ptr<arm_compute::IFunction> instantiate_direct_function(ITensor *input, ITensor *weights, ITensor *biases, ITensor *output, const PadStrideInfo &conv_info)
+{
+    auto conv = arm_compute::support::cpp14::make_unique<ConvolutionType>();
+    conv->configure(
+        dynamic_cast<TensorType *>(input),
+        dynamic_cast<TensorType *>(weights),
+        dynamic_cast<TensorType *>(biases),
+        dynamic_cast<TensorType *>(output),
+        conv_info);
+    return std::move(conv);
+}
+
+template <TargetHint                    target_hint>
+std::unique_ptr<arm_compute::IFunction> instantiate(ITensor *input, ITensor *weights, ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
+                                                    ConvolutionMethodHint conv_method);
 
 template <>
-std::unique_ptr<arm_compute::IFunction> instantiate<Hint::OPENCL>(ITensor *input, Tensor &weights, Tensor &biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info)
+std::unique_ptr<arm_compute::IFunction> instantiate<TargetHint::OPENCL>(ITensor *input, ITensor *weights, ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
+                                                                        const WeightsInfo    &weights_info,
+                                                                        ConvolutionMethodHint conv_method)
 {
-    return instantiate_function<arm_compute::CLConvolutionLayer, arm_compute::CLTensor, Hint::OPENCL>(input, weights, biases, output, conv_info, weights_info);
+    if(conv_method == ConvolutionMethodHint::GEMM)
+    {
+        return instantiate_function<arm_compute::CLConvolutionLayer, arm_compute::ICLTensor, TargetHint::OPENCL>(input, weights, biases, output, conv_info, weights_info);
+    }
+    else
+    {
+        return instantiate_direct_function<arm_compute::CLDirectConvolutionLayer, arm_compute::ICLTensor, TargetHint::OPENCL>(input, weights, biases, output, conv_info);
+    }
 }
 
 template <>
-std::unique_ptr<arm_compute::IFunction> instantiate<Hint::NEON>(ITensor *input, Tensor &weights, Tensor &biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info)
+std::unique_ptr<arm_compute::IFunction> instantiate<TargetHint::NEON>(ITensor *input, ITensor *weights, ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
+                                                                      const WeightsInfo    &weights_info,
+                                                                      ConvolutionMethodHint conv_method)
 {
-    return instantiate_function<arm_compute::NEConvolutionLayer, arm_compute::Tensor, Hint::NEON>(input, weights, biases, output, conv_info, weights_info);
+    if(conv_method == ConvolutionMethodHint::GEMM)
+    {
+        return instantiate_function<arm_compute::NEConvolutionLayer, arm_compute::ITensor, TargetHint::NEON>(input, weights, biases, output, conv_info, weights_info);
+    }
+    else
+    {
+        return instantiate_direct_function<arm_compute::NEDirectConvolutionLayer, arm_compute::ITensor, TargetHint::NEON>(input, weights, biases, output, conv_info);
+    }
 }
 } // namespace
 
-std::unique_ptr<arm_compute::IFunction> ConvolutionLayer::instantiate_node(Hint hint, ITensor *input, ITensor *output)
+/** Grouped Convolution function */
+class GroupedConvolutionFunction final : public arm_compute::IFunction
 {
+public:
+    /** Default Constructor */
+    GroupedConvolutionFunction()
+        : _convolutions()
+    {
+    }
+    /** Default Destructor */
+    ~GroupedConvolutionFunction() final = default;
+    /** Prevent instances from being copy constructed */
+    GroupedConvolutionFunction(const GroupedConvolutionFunction &) = delete;
+    /** Prevent instances from being copy assigned */
+    GroupedConvolutionFunction &operator=(const GroupedConvolutionFunction &) = delete;
+    /** Allow instances to be move constructed */
+    GroupedConvolutionFunction(GroupedConvolutionFunction &&) noexcept = default;
+    /** Allow instances to be move assigned */
+    GroupedConvolutionFunction &operator=(GroupedConvolutionFunction &&) noexcept = default;
+    /** Adds a convolution
+     *
+     * @param convolution Convolution function to add
+     */
+    void add_convolution_function(std::unique_ptr<IFunction> convolution)
+    {
+        _convolutions.emplace_back(std::move(convolution));
+    }
+
+    // Inherited methods overridden:
+    void run() override
+    {
+        for(auto &c : _convolutions)
+        {
+            c->run();
+        }
+    }
+
+private:
+    std::vector<std::unique_ptr<IFunction>> _convolutions;
+};
+
+std::unique_ptr<arm_compute::IFunction> ConvolutionLayer::instantiate_node(GraphContext &ctx, ITensor *input, ITensor *output)
+{
+    // Set weights and biases info
     if(_weights.tensor() == nullptr)
     {
-        _weights.set_info(TensorInfo(TensorShape(_conv_width, _conv_height, input->info()->dimension(2), _ofm), input->info()->num_channels(), input->info()->data_type(),
+        _weights.set_info(TensorInfo(TensorShape(_conv_width, _conv_height, input->info()->dimension(2) / _num_groups, _ofm),
+                                     input->info()->num_channels(), input->info()->data_type(),
                                      input->info()->fixed_point_position()));
     }
     if(_biases.tensor() == nullptr)
@@ -86,32 +184,139 @@
     }
 
     std::unique_ptr<arm_compute::IFunction> func;
-    _hint   = hint;
-    _input  = input;
-    _output = output;
+    _target_hint                                 = ctx.hints().target_hint();
+    const ConvolutionMethodHint conv_method_hint = ctx.hints().convolution_method_hint();
 
-    if(_hint == Hint::OPENCL)
+    // Check if the weights and biases are loaded
+    bool weights_are_loaded = _weights.tensor() != nullptr;
+    bool biases_are_loaded  = _biases.tensor() != nullptr;
+
+    // Set bias and weights target
+    _weights.set_target(_target_hint);
+    _biases.set_target(_target_hint);
+
+    // Calculate output shape
+    TensorShape output_shape = calculate_convolution_layer_output_shape(input->info()->tensor_shape(), _weights.info().tensor_shape(), _conv_info);
+
+    // Auto-initialize the output tensor if not yet initialized
+    arm_compute::auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+    // Create appropriate convolution function
+    if(_num_groups == 1)
     {
-        func = instantiate<Hint::OPENCL>(input, _weights, _biases, output, _conv_info, _weights_info);
+        func = instantiate_convolution(input, output, conv_method_hint);
+        ARM_COMPUTE_LOG("Instantiating CLConvolutionLayer");
     }
     else
     {
-        func = instantiate<Hint::NEON>(input, _weights, _biases, output, _conv_info, _weights_info);
+        func = instantiate_grouped_convolution(input, output, conv_method_hint);
+        ARM_COMPUTE_LOG("Instantiating NEConvolutionLayer");
     }
 
+    // Fill weights
+    if(!weights_are_loaded)
+    {
+        _weights.allocate_and_fill_if_needed();
+    }
+    // Fill biases
+    if(!biases_are_loaded)
+    {
+        _biases.allocate_and_fill_if_needed();
+    }
+
+    ARM_COMPUTE_LOG(" Data Type: " << input->info()->data_type()
+                    << " Input Shape: " << input->info()->tensor_shape()
+                    << " Weights shape: " << _weights.info().tensor_shape()
+                    << " Biases Shape: " << _biases.info().tensor_shape()
+                    << " Output Shape: " << output->info()->tensor_shape()
+                    << " PadStrideInfo: " << _conv_info
+                    << " Groups: " << _num_groups
+                    << " WeightsInfo: " << _weights_info
+                    << std::endl);
+
     return func;
 }
 
-void ConvolutionLayer::print_info()
+std::unique_ptr<arm_compute::IFunction> ConvolutionLayer::instantiate_convolution(ITensor *input, ITensor *output, ConvolutionMethodHint conv_method_hint)
 {
-    if(_hint == Hint::OPENCL)
+    std::unique_ptr<arm_compute::IFunction> func;
+    if(_target_hint == TargetHint::OPENCL)
     {
-        std::cout << "Instantiating CLConvolutionLayer";
+        func = instantiate<TargetHint::OPENCL>(input, _weights.tensor(), _biases.tensor(), output, _conv_info, _weights_info, conv_method_hint);
     }
     else
     {
-        std::cout << "Instantiating NEConvolutionLayer";
+        func = instantiate<TargetHint::NEON>(input, _weights.tensor(), _biases.tensor(), output, _conv_info, _weights_info, conv_method_hint);
     }
-    std::cout << " Type: " << _input->info()->data_type() << " Input Shape: " << _input->info()->tensor_shape() << " Weights shape: " << _weights.info().tensor_shape() << " Biases Shape: " <<
-              _biases.info().tensor_shape() << " Output Shape: " << _output->info()->tensor_shape() << " PadStrideInfo: " << _conv_info << "WeightsInfo: " << _weights_info << std::endl;
+    return func;
+}
+
+std::unique_ptr<arm_compute::IFunction> ConvolutionLayer::instantiate_grouped_convolution(ITensor *input, ITensor *output, ConvolutionMethodHint conv_method_hint)
+{
+    // Get tensor shapes
+    TensorShape input_shape   = input->info()->tensor_shape();
+    TensorShape output_shape  = output->info()->tensor_shape();
+    TensorShape weights_shape = _weights.info().tensor_shape();
+    TensorShape biases_shape  = _biases.info().tensor_shape();
+
+    ARM_COMPUTE_ERROR_ON_MSG((input_shape.z() % _num_groups) != 0, "Input depth not multiple of the number of groups!");
+    ARM_COMPUTE_ERROR_ON_MSG((output_shape.z() % _num_groups) != 0, "Output depth not multiple of the number of groups!");
+    ARM_COMPUTE_ERROR_ON_MSG((weights_shape[3] % _num_groups) != 0, "Number of kernels not multiple of the number of groups!");
+    ARM_COMPUTE_ERROR_ON_MSG((biases_shape.x() % _num_groups) != 0, "Biases not multiple of the number of groups!");
+
+    // Create a grouped convolution function
+    auto grouped_conv = arm_compute::support::cpp14::make_unique<GroupedConvolutionFunction>();
+
+    // Create sub-tensors vectors
+    _is = arm_compute::support::cpp14::make_unique<SubTensor[]>(_num_groups);
+    _os = arm_compute::support::cpp14::make_unique<SubTensor[]>(_num_groups);
+    _ws = arm_compute::support::cpp14::make_unique<SubTensor[]>(_num_groups);
+    _bs = arm_compute::support::cpp14::make_unique<SubTensor[]>(_num_groups);
+
+    // Calculate sub-tensor splits
+    const int input_split   = input_shape.z() / _num_groups;
+    const int output_split  = output_shape.z() / _num_groups;
+    const int weights_split = weights_shape[3] / _num_groups;
+    const int biases_split  = biases_shape.x() / _num_groups;
+
+    // Calculate sub-tensor shapes
+    input_shape.set(2, input_split);
+    output_shape.set(2, output_split);
+    weights_shape.set(3, weights_split);
+    biases_shape.set(0, biases_split);
+
+    // Configure sub-tensors
+    for(int i = 0; i < static_cast<int>(_num_groups); ++i)
+    {
+        // Create convolution function
+        std::unique_ptr<arm_compute::IFunction> func;
+
+        // Calculate sub-tensors starting coordinates
+        Coordinates input_coord(0, 0, input_split * i);
+        Coordinates output_coord(0, 0, output_split * i);
+        Coordinates weights_coord(0, 0, 0, weights_split * i);
+        Coordinates biases_coord(biases_split * i);
+
+        // Create sub-tensors for input, output, weights and bias
+        auto hint_to_use = (_target_hint == TargetHint::OPENCL) ? TargetHint::OPENCL : TargetHint::NEON;
+        _is[i]           = SubTensor(input, input_shape, input_coord, hint_to_use);
+        _os[i]           = SubTensor(output, output_shape, output_coord, hint_to_use);
+        _ws[i]           = SubTensor(_weights.tensor(), weights_shape, weights_coord, hint_to_use);
+        _bs[i]           = SubTensor(_biases.tensor(), biases_shape, biases_coord, hint_to_use);
+
+        // Instantiate convolution function
+        if(_target_hint == TargetHint::OPENCL)
+        {
+            func = instantiate<TargetHint::OPENCL>(_is[i].tensor(), _ws[i].tensor(), _bs[i].tensor(), _os[i].tensor(), _conv_info, _weights_info, conv_method_hint);
+        }
+        else
+        {
+            func = instantiate<TargetHint::NEON>(_is[i].tensor(), _ws[i].tensor(), _bs[i].tensor(), _os[i].tensor(), _conv_info, _weights_info, conv_method_hint);
+        }
+
+        // Add convolution function to the list of convolutions for the grouped convolution
+        grouped_conv->add_convolution_function(std::move(func));
+    }
+
+    return std::move(grouped_conv);
 }
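
The sub-tensor splits computed in instantiate_grouped_convolution are plain integer divisions; a standalone sketch with AlexNet-like values (illustrative only):

#include <cassert>

int main()
{
    const int num_groups  = 2;
    const int input_depth = 96, num_kernels = 256; // illustrative grouped-conv sizes
    assert(input_depth % num_groups == 0 && num_kernels % num_groups == 0);

    const int input_split   = input_depth / num_groups; // 48 input channels per group
    const int weights_split = num_kernels / num_groups; // 128 kernels per group

    // Group i reads input channels [i * input_split, (i + 1) * input_split) and
    // produces output maps [i * weights_split, (i + 1) * weights_split).
    assert(input_split == 48 && weights_split == 128);
    return 0;
}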
diff --git a/src/graph/nodes/FloorLayer.cpp b/src/graph/nodes/FloorLayer.cpp
new file mode 100644
index 0000000..722cfdf
--- /dev/null
+++ b/src/graph/nodes/FloorLayer.cpp
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/nodes/FloorLayer.h"
+
+#include "arm_compute/core/Logger.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/functions/CLFloor.h"
+#include "arm_compute/runtime/NEON/functions/NEFloor.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "support/ToolchainSupport.h"
+#include "utils/TypePrinter.h"
+
+using namespace arm_compute::graph;
+
+namespace
+{
+template <typename FloorType, typename TensorType, TargetHint hint>
+std::unique_ptr<arm_compute::IFunction> instantiate_function(ITensor *input, ITensor *output)
+{
+    auto floorlayer = arm_compute::support::cpp14::make_unique<FloorType>();
+    floorlayer->configure(
+        dynamic_cast<TensorType *>(input),
+        dynamic_cast<TensorType *>(output));
+
+    return std::move(floorlayer);
+}
+
+template <TargetHint                    target_hint>
+std::unique_ptr<arm_compute::IFunction> instantiate(ITensor *input, ITensor *output);
+
+template <>
+std::unique_ptr<arm_compute::IFunction> instantiate<TargetHint::OPENCL>(ITensor *input, ITensor *output)
+{
+    return instantiate_function<arm_compute::CLFloor, arm_compute::ICLTensor, TargetHint::OPENCL>(input, output);
+}
+
+template <>
+std::unique_ptr<arm_compute::IFunction> instantiate<TargetHint::NEON>(ITensor *input, ITensor *output)
+{
+    return instantiate_function<arm_compute::NEFloor, arm_compute::ITensor, TargetHint::NEON>(input, output);
+}
+} // namespace
+
+std::unique_ptr<arm_compute::IFunction> FloorLayer::instantiate_node(GraphContext &ctx, ITensor *input, ITensor *output)
+{
+    std::unique_ptr<arm_compute::IFunction> func;
+    _target_hint = ctx.hints().target_hint();
+
+    if(_target_hint == TargetHint::OPENCL)
+    {
+        func = instantiate<TargetHint::OPENCL>(input, output);
+        ARM_COMPUTE_LOG("Instantiating CLFloorLayer");
+    }
+    else
+    {
+        func = instantiate<TargetHint::NEON>(input, output);
+        ARM_COMPUTE_LOG("Instantiating NEFloorLayer");
+    }
+
+    ARM_COMPUTE_LOG(" Data Type: " << input->info()->data_type()
+                    << " Input shape: " << input->info()->tensor_shape()
+                    << " Output shape: " << output->info()->tensor_shape()
+                    << std::endl);
+
+    return func;
+}
diff --git a/src/graph/nodes/FullyConnectedLayer.cpp b/src/graph/nodes/FullyConnectedLayer.cpp
index 8d244cb..6b21810 100644
--- a/src/graph/nodes/FullyConnectedLayer.cpp
+++ b/src/graph/nodes/FullyConnectedLayer.cpp
@@ -24,6 +24,7 @@
 #include "arm_compute/graph/nodes/FullyConnectedLayer.h"
 
 #include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Logger.h"
 #include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h"
 #include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h"
 #include "support/ToolchainSupport.h"
@@ -33,7 +34,17 @@
 
 namespace
 {
-template <typename FullyConnectedType, typename TensorType, Hint hint>
+TensorShape calculate_fullyconnected_layer_output_shape(const TensorShape &input_shape, unsigned int output_neurons)
+{
+    // Note: Only 1D batch space is supported at the moment
+    unsigned int batches = input_shape[1];
+    if(input_shape.num_dimensions() > 2)
+    {
+        batches = input_shape[3];
+    }
+    return TensorShape(output_neurons, batches);
+}
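
The batch dimension picked by this helper depends on the input rank; a sketch with assumed shapes (only 1D batch spaces are supported, per the note above):

#include <cassert>

int main()
{
    // 2D input (features x batches): batches sit in dimension 1
    const unsigned int batches_from_2d = 16; // input_shape[1]
    // 4D input (W x H x C x batches), e.g. straight out of a convolution:
    const unsigned int batches_from_4d = 16; // input_shape[3]
    // Either way the output becomes TensorShape(output_neurons, batches)
    assert(batches_from_2d == batches_from_4d);
    return 0;
}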
+template <typename FullyConnectedType, typename TensorType, TargetHint target_hint>
 std::unique_ptr<arm_compute::IFunction> instantiate_function(ITensor *input, Tensor &weights, Tensor &biases, ITensor *output)
 {
     bool weights_are_loaded = weights.tensor() != nullptr;
@@ -42,8 +53,8 @@
     auto conv = arm_compute::support::cpp14::make_unique<FullyConnectedType>();
     conv->configure(
         dynamic_cast<TensorType *>(input),
-        dynamic_cast<TensorType *>(weights.set_target(hint)),
-        dynamic_cast<TensorType *>(biases.set_target(hint)),
+        dynamic_cast<TensorType *>(weights.set_target(target_hint)),
+        dynamic_cast<TensorType *>(biases.set_target(target_hint)),
         dynamic_cast<TensorType *>(output));
     if(!weights_are_loaded)
     {
@@ -57,23 +68,23 @@
     return std::move(conv);
 }
 
-template <Hint                          hint>
+template <TargetHint                    target_hint>
 std::unique_ptr<arm_compute::IFunction> instantiate(ITensor *input, Tensor &weights, Tensor &biases, ITensor *output);
 
 template <>
-std::unique_ptr<arm_compute::IFunction> instantiate<Hint::OPENCL>(ITensor *input, Tensor &weights, Tensor &biases, ITensor *output)
+std::unique_ptr<arm_compute::IFunction> instantiate<TargetHint::OPENCL>(ITensor *input, Tensor &weights, Tensor &biases, ITensor *output)
 {
-    return instantiate_function<arm_compute::CLFullyConnectedLayer, arm_compute::CLTensor, Hint::OPENCL>(input, weights, biases, output);
+    return instantiate_function<arm_compute::CLFullyConnectedLayer, arm_compute::CLTensor, TargetHint::OPENCL>(input, weights, biases, output);
 }
 
 template <>
-std::unique_ptr<arm_compute::IFunction> instantiate<Hint::NEON>(ITensor *input, Tensor &weights, Tensor &biases, ITensor *output)
+std::unique_ptr<arm_compute::IFunction> instantiate<TargetHint::NEON>(ITensor *input, Tensor &weights, Tensor &biases, ITensor *output)
 {
-    return instantiate_function<arm_compute::NEFullyConnectedLayer, arm_compute::Tensor, Hint::NEON>(input, weights, biases, output);
+    return instantiate_function<arm_compute::NEFullyConnectedLayer, arm_compute::Tensor, TargetHint::NEON>(input, weights, biases, output);
 }
 } // namespace
 
-std::unique_ptr<arm_compute::IFunction> FullyConnectedLayer::instantiate_node(Hint hint, ITensor *input, ITensor *output)
+std::unique_ptr<arm_compute::IFunction> FullyConnectedLayer::instantiate_node(GraphContext &ctx, ITensor *input, ITensor *output)
 {
     if(_weights.tensor() == nullptr)
     {
@@ -95,36 +106,31 @@
         _biases.set_info(TensorInfo(TensorShape(_num_neurons), input->info()->num_channels(), input->info()->data_type(), input->info()->fixed_point_position()));
     }
 
-    arm_compute::auto_init_if_empty(*output->info(), TensorShape(_num_neurons, input->info()->dimension(1)), input->info()->num_channels(), input->info()->data_type(),
-                                    input->info()->fixed_point_position());
+    // Auto configure output
+    arm_compute::auto_init_if_empty(*output->info(),
+                                    calculate_fullyconnected_layer_output_shape(input->info()->tensor_shape(), _num_neurons),
+                                    input->info()->num_channels(), input->info()->data_type(), input->info()->fixed_point_position());
 
     std::unique_ptr<arm_compute::IFunction> func;
-    _hint   = hint;
-    _input  = input;
-    _output = output;
+    _target_hint = ctx.hints().target_hint();
 
-    if(_hint == Hint::OPENCL)
+    if(_target_hint == TargetHint::OPENCL)
     {
-        func = instantiate<Hint::OPENCL>(input, _weights, _biases, output);
+        func = instantiate<TargetHint::OPENCL>(input, _weights, _biases, output);
+        ARM_COMPUTE_LOG("Instantiating CLFullyConnectedLayer");
     }
     else
     {
-        func = instantiate<Hint::NEON>(input, _weights, _biases, output);
+        func = instantiate<TargetHint::NEON>(input, _weights, _biases, output);
+        ARM_COMPUTE_LOG("Instantiating NEFullyConnectedLayer");
     }
 
+    ARM_COMPUTE_LOG(" Type: " << input->info()->data_type()
+                    << " Input Shape: " << input->info()->tensor_shape()
+                    << " Weights shape: " << _weights.info().tensor_shape()
+                    << " Biases Shape: " << _biases.info().tensor_shape()
+                    << " Output Shape: " << output->info()->tensor_shape()
+                    << std::endl);
+
     return func;
 }
-
-void FullyConnectedLayer::print_info()
-{
-    if(_hint == Hint::OPENCL)
-    {
-        std::cout << "Instantiating CLFullyConnectedLayer";
-    }
-    else
-    {
-        std::cout << "Instantiating NEFullyConnectedLayer";
-    }
-    std::cout << " Type: " << _input->info()->data_type() << " Input Shape: " << _input->info()->tensor_shape() << " Weights shape: " << _weights.info().tensor_shape() << " Biases Shape: " <<
-              _biases.info().tensor_shape() << " Output Shape: " << _output->info()->tensor_shape() << std::endl;
-}
diff --git a/src/graph/nodes/L2NormalizeLayer.cpp b/src/graph/nodes/L2NormalizeLayer.cpp
new file mode 100644
index 0000000..46d1552
--- /dev/null
+++ b/src/graph/nodes/L2NormalizeLayer.cpp
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/nodes/L2NormalizeLayer.h"
+
+#include "arm_compute/core/Logger.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/functions/CLL2Normalize.h"
+#include "arm_compute/runtime/NEON/functions/NEL2Normalize.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "support/ToolchainSupport.h"
+#include "utils/TypePrinter.h"
+
+using namespace arm_compute::graph;
+
+namespace
+{
+template <typename L2NormalizeType, typename TensorType, TargetHint hint>
+std::unique_ptr<arm_compute::IFunction> instantiate_function(ITensor *input, ITensor *output, unsigned int axis, float epsilon)
+{
+    auto l2norm = arm_compute::support::cpp14::make_unique<L2NormalizeType>();
+    l2norm->configure(
+        dynamic_cast<TensorType *>(input),
+        dynamic_cast<TensorType *>(output),
+        axis,
+        epsilon);
+
+    return std::move(l2norm);
+}
+
+template <TargetHint                    target_hint>
+std::unique_ptr<arm_compute::IFunction> instantiate(ITensor *input, ITensor *output, unsigned int axis, float epsilon);
+
+template <>
+std::unique_ptr<arm_compute::IFunction> instantiate<TargetHint::OPENCL>(ITensor *input, ITensor *output, unsigned int axis, float epsilon)
+{
+    return instantiate_function<arm_compute::CLL2Normalize, arm_compute::ICLTensor, TargetHint::OPENCL>(input, output, axis, epsilon);
+}
+
+template <>
+std::unique_ptr<arm_compute::IFunction> instantiate<TargetHint::NEON>(ITensor *input, ITensor *output, unsigned int axis, float epsilon)
+{
+    return instantiate_function<arm_compute::NEL2Normalize, arm_compute::ITensor, TargetHint::NEON>(input, output, axis, epsilon);
+}
+} // namespace
+
+std::unique_ptr<arm_compute::IFunction> L2NormalizeLayer::instantiate_node(GraphContext &ctx, ITensor *input, ITensor *output)
+{
+    std::unique_ptr<arm_compute::IFunction> func;
+    _target_hint = ctx.hints().target_hint();
+
+    if(_target_hint == TargetHint::OPENCL)
+    {
+        func = instantiate<TargetHint::OPENCL>(input, output, _axis, _epsilon);
+        ARM_COMPUTE_LOG("Instantiating CLL2NormalizeLayer");
+    }
+    else
+    {
+        func = instantiate<TargetHint::NEON>(input, output, _axis, _epsilon);
+        ARM_COMPUTE_LOG("Instantiating NEL2NormalizeLayer");
+    }
+
+    ARM_COMPUTE_LOG(" Data Type: " << input->info()->data_type()
+                    << " Input shape: " << input->info()->tensor_shape()
+                    << " Output shape: " << output->info()->tensor_shape()
+                    << std::endl);
+
+    return func;
+}
diff --git a/src/graph/nodes/NormalizationLayer.cpp b/src/graph/nodes/NormalizationLayer.cpp
new file mode 100644
index 0000000..47f0891
--- /dev/null
+++ b/src/graph/nodes/NormalizationLayer.cpp
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/nodes/NormalizationLayer.h"
+
+#include "arm_compute/core/Logger.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/functions/CLNormalizationLayer.h"
+#include "arm_compute/runtime/NEON/functions/NENormalizationLayer.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "support/ToolchainSupport.h"
+#include "utils/TypePrinter.h"
+
+using namespace arm_compute::graph;
+
+namespace
+{
+template <typename NormalizationType, typename TensorType, TargetHint target_hint>
+std::unique_ptr<arm_compute::IFunction> instantiate_function(ITensor *input, ITensor *output, const NormalizationLayerInfo &norm_info)
+{
+    auto norm = arm_compute::support::cpp14::make_unique<NormalizationType>();
+    norm->configure(
+        dynamic_cast<TensorType *>(input),
+        dynamic_cast<TensorType *>(output),
+        norm_info);
+
+    return std::move(norm);
+}
+
+template <TargetHint                    target_hint>
+std::unique_ptr<arm_compute::IFunction> instantiate(ITensor *input, ITensor *output, const NormalizationLayerInfo &norm_info);
+
+template <>
+std::unique_ptr<arm_compute::IFunction> instantiate<TargetHint::OPENCL>(ITensor *input, ITensor *output, const NormalizationLayerInfo &norm_info)
+{
+    return instantiate_function<arm_compute::CLNormalizationLayer, arm_compute::CLTensor, TargetHint::OPENCL>(input, output, norm_info);
+}
+
+template <>
+std::unique_ptr<arm_compute::IFunction> instantiate<TargetHint::NEON>(ITensor *input, ITensor *output, const NormalizationLayerInfo &norm_info)
+{
+    return instantiate_function<arm_compute::NENormalizationLayer, arm_compute::Tensor, TargetHint::NEON>(input, output, norm_info);
+}
+} // namespace
+
+NormalizationLayer::NormalizationLayer(const NormalizationLayerInfo norm_info)
+    : _norm_info(norm_info)
+{
+}
+
+std::unique_ptr<arm_compute::IFunction> NormalizationLayer::instantiate_node(GraphContext &ctx, ITensor *input, ITensor *output)
+{
+    std::unique_ptr<arm_compute::IFunction> func;
+    _target_hint = ctx.hints().target_hint();
+
+    if(_target_hint == TargetHint::OPENCL)
+    {
+        func = instantiate<TargetHint::OPENCL>(input, output, _norm_info);
+        ARM_COMPUTE_LOG("Instantiating CLNormalizationLayer");
+    }
+    else
+    {
+        func = instantiate<TargetHint::NEON>(input, output, _norm_info);
+        ARM_COMPUTE_LOG("Instantiating NENormalizationLayer");
+    }
+
+    ARM_COMPUTE_LOG(" Data Type: " << input->info()->data_type()
+                    << " Input shape: " << input->info()->tensor_shape()
+                    << " Output shape: " << output->info()->tensor_shape()
+                    << " Normalization info: " << _norm_info
+                    << std::endl);
+
+    return func;
+}
diff --git a/src/graph/nodes/PoolingLayer.cpp b/src/graph/nodes/PoolingLayer.cpp
index f29332f..317cf4d 100644
--- a/src/graph/nodes/PoolingLayer.cpp
+++ b/src/graph/nodes/PoolingLayer.cpp
@@ -23,6 +23,7 @@
  */
 #include "arm_compute/graph/nodes/PoolingLayer.h"
 
+#include "arm_compute/core/Logger.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/CL/functions/CLPoolingLayer.h"
 #include "arm_compute/runtime/NEON/functions/NEPoolingLayer.h"
@@ -34,7 +35,7 @@
 
 namespace
 {
-template <typename PoolingType, typename TensorType, Hint hint>
+template <typename PoolingType, typename TensorType, TargetHint target_hint>
 std::unique_ptr<arm_compute::IFunction> instantiate_function(ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info)
 {
     auto pool = arm_compute::support::cpp14::make_unique<PoolingType>();
@@ -46,19 +47,19 @@
     return std::move(pool);
 }
 
-template <Hint                          hint>
+template <TargetHint                    target_hint>
 std::unique_ptr<arm_compute::IFunction> instantiate(ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info);
 
 template <>
-std::unique_ptr<arm_compute::IFunction> instantiate<Hint::OPENCL>(ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info)
+std::unique_ptr<arm_compute::IFunction> instantiate<TargetHint::OPENCL>(ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info)
 {
-    return instantiate_function<arm_compute::CLPoolingLayer, arm_compute::CLTensor, Hint::OPENCL>(input, output, pool_info);
+    return instantiate_function<arm_compute::CLPoolingLayer, arm_compute::CLTensor, TargetHint::OPENCL>(input, output, pool_info);
 }
 
 template <>
-std::unique_ptr<arm_compute::IFunction> instantiate<Hint::NEON>(ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info)
+std::unique_ptr<arm_compute::IFunction> instantiate<TargetHint::NEON>(ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info)
 {
-    return instantiate_function<arm_compute::NEPoolingLayer, arm_compute::Tensor, Hint::NEON>(input, output, pool_info);
+    return instantiate_function<arm_compute::NEPoolingLayer, arm_compute::Tensor, TargetHint::NEON>(input, output, pool_info);
 }
 } // namespace
 
@@ -67,38 +68,26 @@
 {
 }
 
-std::unique_ptr<arm_compute::IFunction> PoolingLayer::instantiate_node(Hint hint, ITensor *input, ITensor *output)
+std::unique_ptr<arm_compute::IFunction> PoolingLayer::instantiate_node(GraphContext &ctx, ITensor *input, ITensor *output)
 {
     std::unique_ptr<arm_compute::IFunction> func;
-    _hint   = hint;
-    _input  = input;
-    _output = output;
+    _target_hint = ctx.hints().target_hint();
 
-    if(_hint == Hint::OPENCL)
+    if(_target_hint == TargetHint::OPENCL)
     {
-        func = instantiate<Hint::OPENCL>(input, output, _pool_info);
+        func = instantiate<TargetHint::OPENCL>(input, output, _pool_info);
+        ARM_COMPUTE_LOG("Instantiating CLPoolingLayer");
     }
     else
     {
-        func = instantiate<Hint::NEON>(input, output, _pool_info);
+        func = instantiate<TargetHint::NEON>(input, output, _pool_info);
+        ARM_COMPUTE_LOG("Instantiating NEPoolingLayer");
     }
 
+    ARM_COMPUTE_LOG(" Data Type: " << input->info()->data_type()
+                    << " Input shape: " << input->info()->tensor_shape()
+                    << " Output shape: " << output->info()->tensor_shape()
+                    << " Pooling info: " << _pool_info << std::endl);
+
     return func;
 }
-
-void PoolingLayer::print_info()
-{
-    if(_hint == Hint::OPENCL)
-    {
-        std::cout << "Instantiating CLPoolingLayer";
-    }
-    else
-    {
-        std::cout << "Instantiating NEPoolingLayer";
-    }
-
-    std::cout << " Data Type: " << _input->info()->data_type()
-              << " Input shape: " << _input->info()->tensor_shape()
-              << " Output shape: " << _output->info()->tensor_shape()
-              << " Pooling info: " << _pool_info << std::endl;
-}
diff --git a/src/graph/nodes/SoftmaxLayer.cpp b/src/graph/nodes/SoftmaxLayer.cpp
index fee8897..8628244 100644
--- a/src/graph/nodes/SoftmaxLayer.cpp
+++ b/src/graph/nodes/SoftmaxLayer.cpp
@@ -23,6 +23,7 @@
  */
 #include "arm_compute/graph/nodes/SoftmaxLayer.h"
 
+#include "arm_compute/core/Logger.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/CL/functions/CLSoftmaxLayer.h"
 #include "arm_compute/runtime/NEON/functions/NESoftmaxLayer.h"
@@ -34,7 +35,7 @@
 
 namespace
 {
-template <typename SoftmaxType, typename TensorType, Hint hint>
+template <typename SoftmaxType, typename TensorType, TargetHint hint>
 std::unique_ptr<arm_compute::IFunction> instantiate_function(ITensor *input, ITensor *output)
 {
     auto softmax = arm_compute::support::cpp14::make_unique<SoftmaxType>();
@@ -45,53 +46,42 @@
     return std::move(softmax);
 }
 
-template <Hint                          hint>
+template <TargetHint                    target_hint>
 std::unique_ptr<arm_compute::IFunction> instantiate(ITensor *input, ITensor *output);
 
 template <>
-std::unique_ptr<arm_compute::IFunction> instantiate<Hint::OPENCL>(ITensor *input, ITensor *output)
+std::unique_ptr<arm_compute::IFunction> instantiate<TargetHint::OPENCL>(ITensor *input, ITensor *output)
 {
-    return instantiate_function<arm_compute::CLSoftmaxLayer, arm_compute::CLTensor, Hint::OPENCL>(input, output);
+    return instantiate_function<arm_compute::CLSoftmaxLayer, arm_compute::CLTensor, TargetHint::OPENCL>(input, output);
 }
 
 template <>
-std::unique_ptr<arm_compute::IFunction> instantiate<Hint::NEON>(ITensor *input, ITensor *output)
+std::unique_ptr<arm_compute::IFunction> instantiate<TargetHint::NEON>(ITensor *input, ITensor *output)
 {
-    return instantiate_function<arm_compute::NESoftmaxLayer, arm_compute::Tensor, Hint::NEON>(input, output);
+    return instantiate_function<arm_compute::NESoftmaxLayer, arm_compute::Tensor, TargetHint::NEON>(input, output);
 }
 } // namespace
 
-std::unique_ptr<arm_compute::IFunction> SoftmaxLayer::instantiate_node(Hint hint, ITensor *input, ITensor *output)
+std::unique_ptr<arm_compute::IFunction> SoftmaxLayer::instantiate_node(GraphContext &ctx, ITensor *input, ITensor *output)
 {
     std::unique_ptr<arm_compute::IFunction> func;
-    _hint   = hint;
-    _input  = input;
-    _output = output;
+    _target_hint = ctx.hints().target_hint();
 
-    if(_hint == Hint::OPENCL)
+    if(_target_hint == TargetHint::OPENCL)
     {
-        func = instantiate<Hint::OPENCL>(input, output);
+        func = instantiate<TargetHint::OPENCL>(input, output);
+        ARM_COMPUTE_LOG("Instantiating CLSoftmaxLayer");
     }
     else
     {
-        func = instantiate<Hint::NEON>(input, output);
+        func = instantiate<TargetHint::NEON>(input, output);
+        ARM_COMPUTE_LOG("Instantiating NESoftmaxLayer");
     }
 
+    ARM_COMPUTE_LOG(" Data Type: " << input->info()->data_type()
+                    << " Input shape: " << input->info()->tensor_shape()
+                    << " Output shape: " << output->info()->tensor_shape()
+                    << std::endl);
+
     return func;
 }
-
-void SoftmaxLayer::print_info()
-{
-    if(_hint == Hint::OPENCL)
-    {
-        std::cout << "Instantiating CLSoftmaxLayer";
-    }
-    else
-    {
-        std::cout << "Instantiating NESoftmaxLayer";
-    }
-    std::cout << " Data Type: " << _input->info()->data_type()
-              << " Input shape: " << _input->info()->tensor_shape()
-              << " Output shape: " << _output->info()->tensor_shape()
-              << std::endl;
-}
diff --git a/src/runtime/CL/functions/CLDepthConcatenate.cpp b/src/runtime/CL/functions/CLDepthConcatenate.cpp
index f42627f..89e44ca 100644
--- a/src/runtime/CL/functions/CLDepthConcatenate.cpp
+++ b/src/runtime/CL/functions/CLDepthConcatenate.cpp
@@ -25,6 +25,7 @@
 
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/PixelValue.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
@@ -51,6 +52,11 @@
     _concat_kernels_vector  = arm_compute::support::cpp14::make_unique<CLDepthConcatenateKernel[]>(_num_inputs);
     _border_handlers_vector = arm_compute::support::cpp14::make_unique<CLFillBorderKernel[]>(_num_inputs);
 
+    TensorShape output_shape = calculate_depth_concatenate_shape(inputs_vector);
+
+    // Auto-initialize the output tensor if not yet initialized
+    auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type(), inputs_vector[0]->info()->fixed_point_position());
+
     for(unsigned int i = 0; i < _num_inputs; i++)
     {
         _concat_kernels_vector[i].configure(inputs_vector.at(i), depth_offset, output);
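
For reference, the depth-concatenate output shape accumulates depths while, judging from the border handling, the spatial dimensions follow the largest input; a standalone sketch of that arithmetic with assumed input shapes:

#include <algorithm>
#include <cassert>

int main()
{
    // Two illustrative inputs: 28x28x64 and 24x24x32
    const unsigned int out_w = std::max(28u, 24u); // 28
    const unsigned int out_h = std::max(28u, 24u); // 28
    const unsigned int out_d = 64u + 32u;          // depths accumulate to 96
    assert(out_w == 28 && out_h == 28 && out_d == 96);
    return 0;
}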
diff --git a/src/runtime/NEON/functions/NECannyEdge.cpp b/src/runtime/NEON/functions/NECannyEdge.cpp
index 9be1df6..c27ff2f 100644
--- a/src/runtime/NEON/functions/NECannyEdge.cpp
+++ b/src/runtime/NEON/functions/NECannyEdge.cpp
@@ -162,7 +162,7 @@
     _edge_trace.configure(&_nonmax, output);
 
     // Fill border with "No edge" to stop recursion in edge trace
-    _border_edge_trace.configure(&_nonmax, _edge_trace.border_size(), BorderMode::CONSTANT, 0);
+    _border_edge_trace.configure(&_nonmax, _edge_trace.border_size(), BorderMode::CONSTANT, static_cast<float>(0.f));
 
     // Allocate intermediate tensors
     _nonmax.allocator()->allocate();
diff --git a/src/runtime/NEON/functions/NEConvolutionLayer.cpp b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
index 40862fc..f34f497 100644
--- a/src/runtime/NEON/functions/NEConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
@@ -190,8 +190,9 @@
     {
         if(_are_weights_reshaped)
         {
-            mat_weights_cols = weights_info.num_kernels();
-            mat_weights_rows = weights->info()->dimension(0) / 4 + (_has_bias ? 1 : 0);
+            const unsigned int transpose_width = 16 / input->info()->element_size();
+            mat_weights_cols                   = weights_info.num_kernels();
+            mat_weights_rows                   = weights->info()->dimension(0) / transpose_width + (_has_bias ? 1 : 0);
         }
         else
         {
diff --git a/src/runtime/NEON/functions/NEDepthConcatenate.cpp b/src/runtime/NEON/functions/NEDepthConcatenate.cpp
index ddf7e90..f8ad2ab 100644
--- a/src/runtime/NEON/functions/NEDepthConcatenate.cpp
+++ b/src/runtime/NEON/functions/NEDepthConcatenate.cpp
@@ -24,6 +24,7 @@
 #include "arm_compute/runtime/NEON/functions/NEDepthConcatenate.h"
 
 #include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/PixelValue.h"
 #include "arm_compute/core/Types.h"
@@ -48,11 +49,16 @@
     _concat_kernels_vector  = arm_compute::support::cpp14::make_unique<NEDepthConcatenateKernel[]>(_num_inputs);
     _border_handlers_vector = arm_compute::support::cpp14::make_unique<NEFillBorderKernel[]>(_num_inputs);
 
+    TensorShape output_shape = calculate_depth_concatenate_shape(inputs_vector);
+
+    // Auto-initialize the output tensor if not yet initialized
+    auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type(), inputs_vector[0]->info()->fixed_point_position());
+
     unsigned int depth_offset = 0;
     for(unsigned int i = 0; i < _num_inputs; ++i)
     {
         _concat_kernels_vector[i].configure(inputs_vector.at(i), depth_offset, output);
-        _border_handlers_vector[i].configure(inputs_vector.at(i), _concat_kernels_vector[i].border_size(), BorderMode::CONSTANT, PixelValue(0));
+        _border_handlers_vector[i].configure(inputs_vector.at(i), _concat_kernels_vector[i].border_size(), BorderMode::CONSTANT, PixelValue(static_cast<float>(0.f)));
 
         depth_offset += inputs_vector.at(i)->info()->dimension(2);
     }
diff --git a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
index b831a6a..52a4cc1 100644
--- a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
@@ -82,7 +82,7 @@
     }
 
     // Add zero padding XY
-    _input_border_handler.configure(input, _conv_kernel.border_size(), BorderMode::CONSTANT, PixelValue(0));
+    _input_border_handler.configure(input, _conv_kernel.border_size(), BorderMode::CONSTANT, PixelValue(static_cast<float>(0.f)));
 }
 
 void NEDirectConvolutionLayer::run()
diff --git a/src/runtime/NEON/functions/NEFlattenLayer.cpp b/src/runtime/NEON/functions/NEFlattenLayer.cpp
new file mode 100644
index 0000000..408eff5
--- /dev/null
+++ b/src/runtime/NEON/functions/NEFlattenLayer.cpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEFlattenLayer.h"
+
+#include "arm_compute/core/NEON/kernels/NEIm2ColKernel.h"
+#include "arm_compute/core/Size2D.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+void NEFlattenLayer::configure(const ITensor *input, ITensor *output)
+{
+    auto k = arm_compute::support::cpp14::make_unique<NEIm2ColKernel>();
+    k->configure(input, output, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false);
+    _kernel = std::move(k);
+}
\ No newline at end of file
diff --git a/src/runtime/NEON/functions/NEIntegralImage.cpp b/src/runtime/NEON/functions/NEIntegralImage.cpp
index 2e94ed5..fa8aaeb 100644
--- a/src/runtime/NEON/functions/NEIntegralImage.cpp
+++ b/src/runtime/NEON/functions/NEIntegralImage.cpp
@@ -36,5 +36,5 @@
     auto k = arm_compute::support::cpp14::make_unique<NEIntegralImageKernel>();
     k->configure(input, output);
     _kernel = std::move(k);
-    _border_handler.configure(output, _kernel->border_size(), BorderMode::CONSTANT, 0);
+    _border_handler.configure(output, _kernel->border_size(), BorderMode::CONSTANT, static_cast<float>(0.f));
 }
diff --git a/src/runtime/NEON/functions/NENonMaximaSuppression3x3.cpp b/src/runtime/NEON/functions/NENonMaximaSuppression3x3.cpp
index 3b59820..0854c9d 100644
--- a/src/runtime/NEON/functions/NENonMaximaSuppression3x3.cpp
+++ b/src/runtime/NEON/functions/NENonMaximaSuppression3x3.cpp
@@ -38,10 +38,10 @@
 
     if(border_mode != BorderMode::UNDEFINED)
     {
-        _border_handler.configure(input, BorderSize(1), BorderMode::CONSTANT, 0);
+        _border_handler.configure(input, BorderSize(1), BorderMode::CONSTANT, static_cast<float>(0.f));
     }
     else
     {
-        _border_handler.configure(input, BorderSize(1), BorderMode::UNDEFINED, 0);
+        _border_handler.configure(input, BorderSize(1), BorderMode::UNDEFINED, static_cast<float>(0.f));
     }
 }
diff --git a/src/runtime/NEON/functions/NEPoolingLayer.cpp b/src/runtime/NEON/functions/NEPoolingLayer.cpp
index 4c4e11f..f8a85b9 100644
--- a/src/runtime/NEON/functions/NEPoolingLayer.cpp
+++ b/src/runtime/NEON/functions/NEPoolingLayer.cpp
@@ -23,19 +23,36 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEPoolingLayer.h"
 
-#include "arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
 #include "support/ToolchainSupport.h"
 
 using namespace arm_compute;
 
+NEPoolingLayer::NEPoolingLayer()
+    : _pooling_layer_kernel(), _border_handler(), _is_global_pooling_layer(false)
+{
+}
+
 void NEPoolingLayer::configure(ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info)
 {
+    // Check if this is a global pooling layer
+    _is_global_pooling_layer = (input->info()->dimension(0) == pool_info.pool_size()) && (input->info()->dimension(1) == pool_info.pool_size());
+
     // Configure pooling kernel
-    auto k = arm_compute::support::cpp14::make_unique<NEPoolingLayerKernel>();
-    k->configure(input, output, pool_info);
-    _kernel = std::move(k);
+    _pooling_layer_kernel.configure(input, output, pool_info);
 
     // Configure border depending on operation required
     BorderMode border_mode = (pool_info.pool_type() == PoolingType::MAX) ? BorderMode::REPLICATE : BorderMode::CONSTANT;
-    _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(0));
+    _border_handler.configure(input, _pooling_layer_kernel.border_size(), border_mode, PixelValue(0.f));
 }
+
+void NEPoolingLayer::run()
+{
+    // Fill border
+    NEScheduler::get().schedule(&_border_handler, Window::DimY);
+
+    // Run pooling layer. For global pooling the Y window collapses to a single
+    // iteration, so split the work across Z (the channel planes) instead.
+    NEScheduler::get().schedule(&_pooling_layer_kernel, _is_global_pooling_layer ? Window::DimZ : Window::DimY);
+}
\ No newline at end of file
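Owning the kernel directly lets NEPoolingLayer choose the split dimension at run time, as explained in the comment above. A usage sketch; the tensor setup and the PoolingLayerInfo arguments are illustrative assumptions:

    NEPoolingLayer pool;
    // 3x3 max pooling, stride 2, no padding; src/dst configured and allocated elsewhere
    pool.configure(&src, &dst, PoolingLayerInfo(PoolingType::MAX, 3, PadStrideInfo(2, 2, 0, 0)));
    pool.run(); // fills the border first, then runs the pooling kernel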
diff --git a/src/runtime/NEON/functions/NEReductionOperation.cpp b/src/runtime/NEON/functions/NEReductionOperation.cpp
index 45c3e5d..f1a9145 100644
--- a/src/runtime/NEON/functions/NEReductionOperation.cpp
+++ b/src/runtime/NEON/functions/NEReductionOperation.cpp
@@ -74,7 +74,7 @@
     // Configure fill border kernel
     BorderSize fill_border_size = (axis == 0) ? _reduction_kernel.border_size() : BorderSize();
     BorderMode fill_border_mode = reduction_operation_border_mode(op);
-    _fill_border_kernel.configure(input, fill_border_size, fill_border_mode, PixelValue(0));
+    _fill_border_kernel.configure(input, fill_border_size, fill_border_mode, PixelValue(0.f));
 }
 
 void NEReductionOperation::run()
diff --git a/src/runtime/OMP/OMPScheduler.cpp b/src/runtime/OMP/OMPScheduler.cpp
index be81641..1dd2511 100644
--- a/src/runtime/OMP/OMPScheduler.cpp
+++ b/src/runtime/OMP/OMPScheduler.cpp
@@ -51,32 +51,34 @@
 void OMPScheduler::set_num_threads(unsigned int num_threads)
 {
     const unsigned int num_cores = omp_get_max_threads();
-    _num_threads                 = num_threads == 0 ? num_cores : num_threads;
+    _num_threads                 = (num_threads == 0) ? num_cores : num_threads;
 }
 
 void OMPScheduler::schedule(ICPPKernel *kernel, unsigned int split_dimension)
 {
     ARM_COMPUTE_ERROR_ON_MSG(!kernel, "The child class didn't set the kernel");
 
+    ThreadInfo info;
+    info.cpu_info = _info;
+
     const Window      &max_window     = kernel->window();
     const unsigned int num_iterations = max_window.num_iterations(split_dimension);
-    const unsigned int num_threads    = std::min(num_iterations, _num_threads);
+    info.num_threads                  = std::min(num_iterations, _num_threads);
 
-    if(!kernel->is_parallelisable() || 1 == num_threads)
+    if(!kernel->is_parallelisable() || info.num_threads == 1)
     {
-        kernel->run(max_window);
+        kernel->run(max_window, info);
     }
     else
     {
-        #pragma omp parallel num_threads(num_threads)
+        // firstprivate gives each thread its own copy of info: thread_id is
+        // written per thread below and would otherwise race on a shared copy.
+        #pragma omp parallel firstprivate(info) num_threads(info.num_threads)
         {
             #pragma omp for
-            for(unsigned int t = 0; t < num_threads; ++t)
+            for(int t = 0; t < info.num_threads; ++t)
             {
-                Window win = max_window.split_window(split_dimension, t, num_threads);
-                win.set_thread_id(t);
-                win.set_num_threads(num_threads);
-                kernel->run(win);
+                Window win     = max_window.split_window(split_dimension, t, info.num_threads);
+                info.thread_id = t;
+                kernel->run(win, info);
             }
         }
     }
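The window no longer carries thread bookkeeping; kernels now receive it through the ThreadInfo argument of run(). A sketch of how a kernel might consume it; MyKernel, its scratch storage and process() are hypothetical, not library code:

    void MyKernel::run(const Window &window, const ThreadInfo &info)
    {
        // window is already this thread's slice from split_window();
        // the ids can index per-thread storage, e.g. one scratch buffer each:
        auto &scratch = _scratch[info.thread_id]; // _scratch sized to info.num_threads
        process(window, scratch);
    }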
diff --git a/src/runtime/Utils.cpp b/src/runtime/Utils.cpp
index 1b06117..81de782 100644
--- a/src/runtime/Utils.cpp
+++ b/src/runtime/Utils.cpp
@@ -28,6 +28,10 @@
 
 using namespace arm_compute;
 
+static const std::string information =
+#include "arm_compute_version.embed"
+    ;
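// Note: arm_compute_version.embed is generated by the build and contains only
// a quoted string literal, so the #include above expands into an ordinary
// initializer for `information`. Illustrative, build-dependent content
// (the exact format is an assumption):
//     "arm_compute_version=v17.10 Build options: <as configured> Git hash=<sha>"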
+
 const std::string &arm_compute::string_from_scheduler_type(Scheduler::Type t)
 {
     static std::map<Scheduler::Type, const std::string> scheduler_type_map =