arm_compute v17.04
diff --git a/src/runtime/NEON/functions/NECannyEdge.cpp b/src/runtime/NEON/functions/NECannyEdge.cpp
index 632e470..2d7ad86 100644
--- a/src/runtime/NEON/functions/NECannyEdge.cpp
+++ b/src/runtime/NEON/functions/NECannyEdge.cpp
@@ -62,7 +62,7 @@
     TensorInfo         gradient_info;
     TensorInfo         magnitude_info;
 
-    /* Initialize images */
+    // Initialize images
     if(gradient_size < 7)
     {
         gradient_info.init(shape, Format::S16);
@@ -82,7 +82,7 @@
     _phase.allocator()->init(info);
     _nonmax.allocator()->init(info);
 
-    /* Configure/Init sobelNxN */
+    // Configure/Init sobelNxN
     if(gradient_size == 3)
     {
         auto k = arm_compute::cpp14::make_unique<NESobel3x3>();
@@ -106,7 +106,7 @@
         ARM_COMPUTE_ERROR("Gradient size not supported\n");
     }
 
-    /* Configure gradient */
+    // Configure gradient
     if(use_fp16)
     {
         auto k = arm_compute::cpp14::make_unique<NEGradientFP16Kernel>();
@@ -120,28 +120,24 @@
         _gradient = std::move(k);
     }
 
-    _gx.allocator()->allocate();
-    _gy.allocator()->allocate();
-
-    /* Configure non-maxima suppression */
+    // Configure non-maxima suppression
     _non_max_suppr.configure(&_magnitude, &_phase, &_nonmax, upper_thr, lower_thr, border_mode == BorderMode::UNDEFINED);
 
-    _phase.allocator()->allocate();
+    // Fill the border around the magnitude image as non-maxima suppression will
+    // access it. If the border mode is UNDEFINED, filling the border is a no-op.
+    _border_mag_gradient.configure(&_magnitude, _non_max_suppr.border_size(), border_mode, constant_border_value);
 
-    if(border_mode != BorderMode::UNDEFINED)
-    {
-        /* Configure border filling for magnitude image */
-        _border_mag_gradient.configure(&_magnitude, _non_max_suppr.border_size(), BorderMode::CONSTANT, 0);
-    }
-
-    _magnitude.allocator()->allocate();
-
-    /* Configure edge tracing */
+    // Configure edge tracing
     _edge_trace.configure(&_nonmax, output);
 
     // Fill border with "No edge" to stop recursion in edge trace
     _border_edge_trace.configure(&_nonmax, _edge_trace.border_size(), BorderMode::CONSTANT, 0);
 
+    // Allocate intermediate tensors
+    _gx.allocator()->allocate();
+    _gy.allocator()->allocate();
+    _phase.allocator()->allocate();
+    _magnitude.allocator()->allocate();
     _nonmax.allocator()->allocate();
 }
 
@@ -150,16 +146,16 @@
     ARM_COMPUTE_ERROR_ON_MSG(_sobel == nullptr, "Unconfigured function");
     ARM_COMPUTE_ERROR_ON(_output == nullptr);
 
-    /* Run sobelNxN */
+    // Run sobelNxN
     _sobel->run();
 
-    /* Run gradient */
-    NEScheduler::get().multithread(_gradient.get());
-
-    /* Fill border before non-maxima suppression */
+    // Fill the border before non-maxima suppression. This is a no-op when the border mode is UNDEFINED.
     _border_mag_gradient.run(_border_mag_gradient.window());
 
-    /* Run non-maxima suppression */
+    // Run gradient
+    NEScheduler::get().multithread(_gradient.get());
+
+    // Run non-maxima suppression
     NEScheduler::get().multithread(&_non_max_suppr);
 
     ARM_COMPUTE_ERROR_ON(_output->buffer() == nullptr);
@@ -168,6 +164,6 @@
     // Fill border before edge trace
     _border_edge_trace.run(_border_edge_trace.window());
 
-    /* Run edge tracing */
+    // Run edge tracing
     _edge_trace.run(_edge_trace.window());
 }
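
Note: the reordering above is the pattern applied throughout this patch: intermediate tensors are configured first and allocated only after every kernel's configure() call, so padding requirements can still grow before memory is committed. A minimal sketch of that ordering, using types and calls that appear in this patch (header paths and the Sobel signature are assumed from the v17.04 API, illustrative only):

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/NEON/functions/NESobel3x3.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void configure_then_allocate(Tensor &input, Tensor &gx, Tensor &gy)
    {
        // Describe the intermediate tensors first; no memory is reserved yet
        gx.allocator()->init(TensorInfo(input.info()->tensor_shape(), Format::S16));
        gy.allocator()->init(TensorInfo(input.info()->tensor_shape(), Format::S16));

        // Each configure() call may extend the tensors' padding requirements
        NESobel3x3 sobel;
        sobel.configure(&input, &gx, &gy, BorderMode::UNDEFINED, 0);

        // Allocate only once all configure() calls are done and the final
        // padding is known
        gx.allocator()->allocate();
        gy.allocator()->allocate();
    }
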
diff --git a/src/runtime/NEON/functions/NEConvolutionLayer.cpp b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
index 89ec900..27eb4bc 100644
--- a/src/runtime/NEON/functions/NEConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
@@ -35,7 +35,7 @@
 
 NEConvolutionLayer::NEConvolutionLayer()
     : _input_im2col_kernel(), _input_interleave_kernel(), _weights_reshape_kernel(), _weights_transposed_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(),
-      _input_interleaved_reshaped(), _weights_reshaped(), _weights_transposed(), _gemm_output(), _is_first_run(false), _has_bias(false), _is_fc(false)
+      _input_interleaved_reshaped(), _weights_reshaped(), _weights_transposed(), _gemm_output(), _is_first_run(false), _has_bias(false)
 {
 }
 
@@ -64,22 +64,12 @@
     std::tie(stride_x, stride_y) = conv_info.stride();
     std::tie(pad_x, pad_y)       = conv_info.pad();
 
-    bool is_same_dimension = true;
-    // Make sure the input and weights have same low three dimensions
-    for(int i = 0; i < 3; i++)
-    {
-        is_same_dimension = (is_same_dimension) && (input->info()->dimension(i) == weights->info()->dimension(i));
-    }
-
-    // Run the fully connected path if is_same_dimension is true and conv_stride_x/conv_stride_y are 1, and conv_pad_x/conv_pad_y are 0 and skip col2im
-    _is_fc = (is_same_dimension) && ((stride_x & stride_y) == 1) && ((pad_x | pad_y) == 0);
-
     // Get convolved dimensions
     unsigned int conv_w = 0;
     unsigned int conv_h = 0;
-
     std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), weights->info()->dimension(0),
                                                  stride_x, stride_y, pad_x, pad_y, conv_info.round());
+    ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one");
 
     // Create tensor to store the reshaped weights
     const size_t      mat_weights_cols = weights->info()->dimension(3);
@@ -95,15 +85,11 @@
 
     // Create tensor to store im2col reshaped inputs
     const size_t mat_input_cols = mat_weights_rows;
-    const size_t mat_input_rows = _is_fc ? (input->info()->dimension(3)) : (conv_w * conv_h);
+    const size_t mat_input_rows = conv_w * conv_h;
     TensorShape  shape_im2col   = input->info()->tensor_shape();
     shape_im2col.set(0, mat_input_cols);
     shape_im2col.set(1, mat_input_rows);
     shape_im2col.set(2, 1);
-    if(_is_fc)
-    {
-        shape_im2col.set(3, 1);
-    }
     TensorInfo info_im2col(shape_im2col, 1, input->info()->data_type());
     _input_im2col_reshaped.allocator()->init(info_im2col);
 
@@ -126,16 +112,8 @@
     _input_interleave_kernel.configure(&_input_im2col_reshaped, &_input_interleaved_reshaped);
     _weights_reshape_kernel.configure(weights, biases, &_weights_reshaped);
     _weights_transposed_kernel.configure(&_weights_reshaped, &_weights_transposed);
-
-    if(_is_fc)
-    {
-        _mm_kernel.configure(&_input_interleaved_reshaped, &_weights_transposed, output, 1.0f);
-    }
-    else
-    {
-        _mm_kernel.configure(&_input_interleaved_reshaped, &_weights_transposed, &_gemm_output, 1.0f);
-        _output_col2im_kernel.configure(&_gemm_output, output, std::make_pair(conv_w, conv_h));
-    }
+    _mm_kernel.configure(&_input_interleaved_reshaped, &_weights_transposed, &_gemm_output, 1.0f);
+    _output_col2im_kernel.configure(&_gemm_output, output, std::make_pair(conv_w, conv_h));
 
     // Allocate the tensors once the all configure methods have been called
     _weights_reshaped.allocator()->allocate();
@@ -165,8 +143,5 @@
     NEScheduler::get().multithread(&_mm_kernel);
 
     // Reshape output matrix
-    if(!_is_fc)
-    {
-        NEScheduler::get().multithread(&_output_col2im_kernel);
-    }
+    NEScheduler::get().multithread(&_output_col2im_kernel);
 }
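
For reference, the new assertion ties the output tensor to the convolved dimensions returned by scaled_dimensions(). A hedged sketch of that relationship, assuming FLOOR rounding (the library call is expected to compute the equivalent):

    // Sketch only: expected convolved output dimension under FLOOR rounding.
    unsigned int convolved_dim(unsigned int in, unsigned int kernel, unsigned int stride, unsigned int pad)
    {
        return (in + 2 * pad - kernel) / stride + 1;
    }
    // Example: a 224x224 input with a 7x7 kernel, stride 2 and pad 3 gives a 112x112 output.
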
diff --git a/src/runtime/NEON/functions/NEFastCorners.cpp b/src/runtime/NEON/functions/NEFastCorners.cpp
index e67e4d6..670b4d4 100644
--- a/src/runtime/NEON/functions/NEFastCorners.cpp
+++ b/src/runtime/NEON/functions/NEFastCorners.cpp
@@ -40,14 +40,13 @@
       _border_handler(),
       _nonmax_kernel(),
       _fill_kernel(),
-      _out_border_handler_kernel(),
       _output(),
       _suppressed(),
       _non_max(false)
 {
 }
 
-void NEFastCorners::configure(IImage *input, float threshold, bool nonmax_suppression, KeyPointArray *const corners,
+void NEFastCorners::configure(IImage *input, float threshold, bool nonmax_suppression, KeyPointArray *corners,
                               BorderMode border_mode, uint8_t constant_border_value)
 {
     ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
@@ -56,19 +55,18 @@
     ARM_COMPUTE_ERROR_ON(nullptr == corners);
     ARM_COMPUTE_ERROR_ON(threshold < 1 || threshold > 255);
 
+    _non_max = nonmax_suppression;
+
     TensorInfo tensor_info(input->info()->tensor_shape(), Format::U8);
     _output.allocator()->init(tensor_info);
-    _border_handler.configure(input, _fast_corners_kernel.border_size(), border_mode, constant_border_value);
-    /*
-        If border is UNDEFINED _fast_corners_kernel will operate in xwindow (3, width - 3) and ywindow (3, height -3) so
-        the output image will leave the pixels on the borders unchanged. This can cause problems if Non Max Suppression is performed afterwards.
 
-        If non max sup is true && border == UNDEFINED we must set the border texels to 0 before executing the non max sup kernel
-    */
+    // If the border is UNDEFINED, _fast_corners_kernel will operate on the x
+    // window (3, width - 3) and y window (3, height - 3), so the output image
+    // will leave the pixels on the borders unchanged. This is reflected in the
+    // valid region of the output; non-maxima suppression is only run on the
+    // valid pixels.
     _fast_corners_kernel.configure(input, &_output, threshold, nonmax_suppression, BorderMode::UNDEFINED == border_mode);
-
-    _output.allocator()->allocate();
-    _non_max = nonmax_suppression;
+    _border_handler.configure(input, _fast_corners_kernel.border_size(), border_mode, constant_border_value);
 
     if(!_non_max)
     {
@@ -76,26 +74,26 @@
     }
     else
     {
-        if(border_mode == BorderMode::UNDEFINED)
-        {
-            // We use this kernel to set the borders to 0 before performing non max sup
-            _out_border_handler_kernel.configure(&_output, _fast_corners_kernel.border_size(), PixelValue(static_cast<uint8_t>(0)));
-        }
-
         _suppressed.allocator()->init(tensor_info);
-        _suppressed.allocator()->allocate();
         _nonmax_kernel.configure(&_output, &_suppressed, BorderMode::UNDEFINED == border_mode);
         _fill_kernel.configure(&_suppressed, 1 /* we keep all texels >0 */, corners);
+
+        // Allocate intermediate tensors
+        _suppressed.allocator()->allocate();
     }
+
+    // Allocate intermediate tensors
+    _output.allocator()->allocate();
 }
 
 void NEFastCorners::run()
 {
+    _border_handler.run(_border_handler.window());
+
     NEScheduler::get().multithread(&_fast_corners_kernel);
 
     if(_non_max)
     {
-        NEScheduler::get().multithread(&_out_border_handler_kernel); // make sure inner borders are set to 0 before running non max sup kernel
         NEScheduler::get().multithread(&_nonmax_kernel);
     }
 
diff --git a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
index c6ef6c6..e6785b3 100644
--- a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
+++ b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
@@ -26,69 +26,234 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 
+#include <algorithm>
+#include <cmath>
+
 using namespace arm_compute;
 
 NEFullyConnectedLayer::NEFullyConnectedLayer()
-    : _conv_function(), _gemm_function(), _transpose_kernel(), _acc_biases_kernel(), _run_func(), _weights_transposed(), _is_first_run(true), _run_acc_biases(false)
+    : _im2col_kernel(), _transpose_kernel(), _transpose1xW_kernel(), _interleave4x4_kernel(), _mm_kernel(), _accumulate_biases_kernel(), _im2col_output(), _interleave4x4_output(), _transpose_output(),
+      _transpose1xW_output(), _is_first_run(true), _transpose_weights(true), _fc_after_conv(false), _batched_fc_layer(false), _accumulate_biases(false)
 {
 }
 
-void NEFullyConnectedLayer::configure(ITensor *input, ITensor *weights, const ITensor *biases, ITensor *output)
+void NEFullyConnectedLayer::configure_conv_fc_wb(const ITensor *input, const ITensor *weights, ITensor *output)
 {
-    ARM_COMPUTE_ERROR_ON((weights->info()->num_dimensions() != 2) && (weights->info()->num_dimensions() != 4));
+    ARM_COMPUTE_ERROR_ON(weights->info()->dimension(1) != (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2)));
 
-    // Make sure that in the fully connected layer connected to fully connected layer case, the first dimension of the weights and input are same.
-    ARM_COMPUTE_ERROR_ON((weights->info()->num_dimensions() == 2) && (input->info()->dimension(0) != weights->info()->dimension(0)));
+    // If the fully connected layer is called after a convolution layer, the input tensor must be linearized
 
-    if(weights->info()->num_dimensions() != 2)
-    {
-        _conv_function.configure(input, weights, biases, output, PadStrideInfo(1, 1, 0, 0, DimensionRoundingType::FLOOR));
-        _run_func = &NEFullyConnectedLayer::run_conv;
-        return;
-    }
+    // Initialize output tensor for im2col
+    TensorShape shape_im2col;
+    shape_im2col.set(0, weights->info()->dimension(1));
+    shape_im2col.set(1, input->info()->dimension(3));
+    shape_im2col.set(2, input->info()->dimension(4));
+    shape_im2col.set(3, input->info()->dimension(5));
+    _im2col_output.allocator()->init(TensorInfo(shape_im2col, 1, input->info()->data_type()));
 
-    TensorShape shape_trans(weights->info()->dimension(1), weights->info()->dimension(0));
-    TensorInfo  tensor_info(shape_trans, 1, weights->info()->data_type());
-    _weights_transposed.allocator()->init(tensor_info);
+    // Initialize output tensor for interleave 4x4
+    TensorShape shape_interleaved = _im2col_output.info()->tensor_shape();
+    shape_interleaved.set(0, shape_interleaved.x() * 4);
+    shape_interleaved.set(1, std::ceil(static_cast<float>(shape_interleaved.y()) / 4));
+    _interleave4x4_output.allocator()->init(TensorInfo(shape_interleaved, 1, input->info()->data_type()));
 
-    _transpose_kernel.configure(weights, &_weights_transposed);
-    _gemm_function.configure(input, &_weights_transposed, nullptr, output, 1.0f, 0.0f);
+    // Initialize output tensor for transpose 1xW
+    TensorShape shape_transposed1xW(weights->info()->dimension(1) * 4, static_cast<size_t>(std::ceil(weights->info()->dimension(0) / 4.f)));
+    _transpose1xW_output.allocator()->init(TensorInfo(shape_transposed1xW, 1, weights->info()->data_type()));
+
+    // Configure im2col kernel
+    _im2col_kernel.configure(input, &_im2col_output, std::make_pair(1, 1), PadStrideInfo(1, 1, 0, 0), false);
+
+    // Configure interleave4x4 kernel
+    _interleave4x4_kernel.configure(&_im2col_output, &_interleave4x4_output);
+
+    // Configure transpose 1xW kernel
+    _transpose1xW_kernel.configure(weights, &_transpose1xW_output);
+
+    // Configure matrix multiply kernel
+    _mm_kernel.configure(&_interleave4x4_output, &_transpose1xW_output, output, 1.0f);
+
+    // Allocate the tensors once all the configure methods have been called
+    _im2col_output.allocator()->allocate();
+    _interleave4x4_output.allocator()->allocate();
+    _transpose1xW_output.allocator()->allocate();
+}
+
+void NEFullyConnectedLayer::configure_fc_fc_wb(const ITensor *input, const ITensor *weights, ITensor *output)
+{
+    // Initialize output tensor for interleave 4x4
+    TensorShape shape_interleaved = input->info()->tensor_shape();
+    shape_interleaved.set(0, shape_interleaved.x() * 4);
+    shape_interleaved.set(1, std::ceil(static_cast<float>(shape_interleaved.y()) / 4));
+    _interleave4x4_output.allocator()->init(TensorInfo(shape_interleaved, 1, input->info()->data_type()));
+
+    // Initialize output tensor for transpose 1xW
+    TensorShape shape_transposed1xW(weights->info()->dimension(1) * 4, static_cast<size_t>(std::ceil(weights->info()->dimension(0) / 4.f)));
+    _transpose1xW_output.allocator()->init(TensorInfo(shape_transposed1xW, 1, weights->info()->data_type()));
+
+    // Configure interleave4x4 kernel
+    _interleave4x4_kernel.configure(input, &_interleave4x4_output);
+
+    // Configure transpose 1xW kernel
+    _transpose1xW_kernel.configure(weights, &_transpose1xW_output);
+
+    // Configure matrix multiply kernel
+    _mm_kernel.configure(&_interleave4x4_output, &_transpose1xW_output, output, 1.0f);
+
+    // Allocate the tensors once all the configure methods have been called
+    _interleave4x4_output.allocator()->allocate();
+    _transpose1xW_output.allocator()->allocate();
+}
+
+void NEFullyConnectedLayer::configure_conv_fc_nb(const ITensor *input, const ITensor *weights, ITensor *output)
+{
+    ARM_COMPUTE_ERROR_ON((weights->info()->dimension(1) != (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2))));
+
+    // If the fully connected layer is called after a convolution layer, the input tensor must be linearized
+
+    // Initialize output tensor for im2col
+    TensorShape shape_im2col;
+    shape_im2col.set(0, weights->info()->dimension(1));
+    shape_im2col.set(1, 1);
+    _im2col_output.allocator()->init(TensorInfo(shape_im2col, 1, input->info()->data_type()));
+
+    // Configure im2col kernel
+    _im2col_kernel.configure(input, &_im2col_output, std::make_pair(1, 1), PadStrideInfo(1, 1, 0, 0), false);
+
+    // Configure matrix multiply kernel
+    _mm_kernel.configure(&_im2col_output, weights, output, 1.0f);
+
+    // Allocate the output tensor for im2col once all the configure methods have been called
+    _im2col_output.allocator()->allocate();
+}
+
+void NEFullyConnectedLayer::configure_fc_fc_nb(const ITensor *input, const ITensor *weights, ITensor *output)
+{
+    ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != weights->info()->dimension(1));
+
+    // Configure matrix multiply kernel
+    _mm_kernel.configure(input, weights, output, 1.0f);
+}
+
+void NEFullyConnectedLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, bool transpose_weights)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
+    ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() != 2);
+
+    _is_first_run      = true;
+    _transpose_weights = transpose_weights;
+    _fc_after_conv     = true;
+    _batched_fc_layer  = false;
+    _accumulate_biases = false;
+
+    const ITensor *weights_to_use = weights;
 
     if(biases != nullptr)
     {
-        _acc_biases_kernel.configure(output, biases);
-        _run_acc_biases = true;
+        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+
+        _accumulate_biases = true;
+
+        // Configure accumulate biases kernel
+        _accumulate_biases_kernel.configure(output, biases);
     }
 
-    _run_func = &NEFullyConnectedLayer::run_fc;
-
-    // Allocate once all the configure methods have been called
-    _weights_transposed.allocator()->allocate();
-}
-
-void NEFullyConnectedLayer::run_conv()
-{
-    _conv_function.run();
-}
-
-void NEFullyConnectedLayer::run_fc()
-{
-    if(_is_first_run)
+    // Check if we need to transpose the weights
+    if(_transpose_weights)
     {
-        _is_first_run = false;
-        NEScheduler::get().multithread(&_transpose_kernel);
+        // Initialize the output tensor for transpose
+        TensorShape shape_transposed(weights->info()->dimension(1), weights->info()->dimension(0));
+        _transpose_output.allocator()->init(TensorInfo(shape_transposed, 1, weights->info()->data_type()));
+        _transpose_kernel.configure(weights, &_transpose_output);
+
+        weights_to_use = &_transpose_output;
     }
 
-    _gemm_function.run();
+    // With the Fully Connected layer we can have 4 different cases:
+    //  1) Convolution layer -> Fully Connected layer without batches
+    //  2) Fully Connected layer -> Fully Connected layer without batches
+    //  3) Convolution layer -> Fully Connected layer with batches
+    //  4) Fully Connected layer -> Fully Connected layer with batches
 
-    if(_run_acc_biases)
+    // Check if we have a fully connected layer with batches
+    _batched_fc_layer = (output->info()->dimension(1) > 1);
+
+    if(_batched_fc_layer)
     {
-        NEScheduler::get().multithread(&_acc_biases_kernel);
+        _fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(input->info()->tensor_shape().cbegin() + 3,
+                                                                               input->info()->tensor_shape().cend(),
+                                                                               output->info()->tensor_shape().cbegin() + 1));
+
+        if(_fc_after_conv)
+        {
+            // Fully Connected layer after a Convolution Layer with batches
+            configure_conv_fc_wb(input, weights_to_use, output);
+        }
+        else
+        {
+            // Fully Connected layer after a Fully Connected Layer with batches
+            configure_fc_fc_wb(input, weights_to_use, output);
+        }
+    }
+    else
+    {
+        _fc_after_conv = (weights_to_use->info()->dimension(1) == (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2)));
+
+        if(_fc_after_conv)
+        {
+            // Fully Connected layer after a Convolution Layer without batches
+            configure_conv_fc_nb(input, weights_to_use, output);
+        }
+        else
+        {
+            // Fully Connected layer after a Fully Connected Layer without batches
+            configure_fc_fc_nb(input, weights_to_use, output);
+        }
+    }
+
+    // Allocate the transpose tensor if the transpose_weights flag is true and once all the configure methods have been called
+    if(_transpose_weights)
+    {
+        _transpose_output.allocator()->allocate();
     }
 }
 
 void NEFullyConnectedLayer::run()
 {
-    ARM_COMPUTE_ERROR_ON(_run_func == nullptr);
-    (this->*_run_func)();
+    // Reshape of the weights (happens only once)
+    if(_is_first_run)
+    {
+        _is_first_run = false;
+        if(_transpose_weights)
+        {
+            NEScheduler::get().multithread(&_transpose_kernel);
+        }
+        if(_batched_fc_layer)
+        {
+            NEScheduler::get().multithread(&_transpose1xW_kernel);
+        }
+    }
+
+    // Linearize the input if it comes from a convolution layer
+    if(_fc_after_conv)
+    {
+        NEScheduler::get().multithread(&_im2col_kernel);
+    }
+
+    // Interleave input
+    if(_batched_fc_layer)
+    {
+        NEScheduler::get().multithread(&_interleave4x4_kernel);
+    }
+
+    // Run matrix multiply
+    NEScheduler::get().multithread(&_mm_kernel);
+
+    // Accumulate biases if provided
+    if(_accumulate_biases)
+    {
+        NEScheduler::get().multithread(&_accumulate_biases_kernel);
+    }
 }
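
The reworked layer picks one of the four cases listed in the comment above purely from the tensor shapes. A minimal usage sketch for the convolution -> fully connected case without batches, with hypothetical shapes and transpose_weights set to false (i.e. the weights are assumed to already be laid out as (num_outputs, num_inputs)); header paths are assumed from the v17.04 API:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void fully_connected_example()
    {
        Tensor input, weights, biases, output;

        // Hypothetical shapes: a 7x7x64 feature map feeding 1000 output neurons
        input.allocator()->init(TensorInfo(TensorShape(7U, 7U, 64U), 1, DataType::F32));
        weights.allocator()->init(TensorInfo(TensorShape(1000U, 7U * 7U * 64U), 1, DataType::F32));
        biases.allocator()->init(TensorInfo(TensorShape(1000U), 1, DataType::F32));
        output.allocator()->init(TensorInfo(TensorShape(1000U), 1, DataType::F32));

        NEFullyConnectedLayer fc;
        fc.configure(&input, &weights, &biases, &output, false /* transpose_weights */);

        // Allocate after configure(), fill input/weights/biases, then run
        input.allocator()->allocate();
        weights.allocator()->allocate();
        biases.allocator()->allocate();
        output.allocator()->allocate();
        fc.run();
    }
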
diff --git a/src/runtime/NEON/functions/NEGEMMInterleave4x4.cpp b/src/runtime/NEON/functions/NEGEMMInterleave4x4.cpp
new file mode 100644
index 0000000..4c77c88
--- /dev/null
+++ b/src/runtime/NEON/functions/NEGEMMInterleave4x4.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEGEMMInterleave4x4.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
+
+using namespace arm_compute;
+
+void NEGEMMInterleave4x4::configure(const ITensor *input, ITensor *output)
+{
+    auto k = arm_compute::cpp14::make_unique<NEGEMMInterleave4x4Kernel>();
+    k->configure(input, output);
+    _kernel = std::move(k);
+}
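
This new simple function wraps NEGEMMInterleave4x4Kernel. Going by the fully connected layer above, interleaving a (W x H) matrix produces a (W*4 x ceil(H/4)) matrix in which 4x4 blocks are laid out contiguously for the matrix multiply kernel. A hedged usage sketch with illustrative shapes; run() is assumed to be inherited from the simple-function base class:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/NEON/functions/NEGEMMInterleave4x4.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void interleave_example()
    {
        Tensor src, dst;
        src.allocator()->init(TensorInfo(TensorShape(8U, 6U), 1, DataType::F32));
        // Interleaved shape: (8 * 4, ceil(6 / 4)) = (32, 2)
        dst.allocator()->init(TensorInfo(TensorShape(32U, 2U), 1, DataType::F32));

        NEGEMMInterleave4x4 interleave;
        interleave.configure(&src, &dst);

        src.allocator()->allocate();
        dst.allocator()->allocate();
        // ... fill src, then:
        interleave.run();
    }
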
diff --git a/src/runtime/NEON/functions/NEGEMMTranspose1xW.cpp b/src/runtime/NEON/functions/NEGEMMTranspose1xW.cpp
new file mode 100644
index 0000000..8cba30d
--- /dev/null
+++ b/src/runtime/NEON/functions/NEGEMMTranspose1xW.cpp
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEGEMMTranspose1xW.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+using namespace arm_compute;
+
+void NEGEMMTranspose1xW::configure(const ITensor *input, ITensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != input->info()->dimension(1) * 4);
+    ARM_COMPUTE_ERROR_ON(output->info()->dimension(1) != std::ceil(input->info()->dimension(0) / 4.0f));
+    auto k = arm_compute::cpp14::make_unique<NEGEMMTranspose1xWKernel>();
+    k->configure(input, output);
+    _kernel = std::move(k);
+}
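
The asserts in this file fix the expected shape: for an input of shape (W, H), the 1xW-transposed output must have shape (H*4, ceil(W/4)). A hedged usage sketch with illustrative shapes (same assumptions as the interleave sketch above):

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/NEON/functions/NEGEMMTranspose1xW.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void transpose1xw_example()
    {
        Tensor src, dst;
        src.allocator()->init(TensorInfo(TensorShape(10U, 3U), 1, DataType::F32));
        // Transposed 1xW shape: (3 * 4, ceil(10 / 4)) = (12, 3)
        dst.allocator()->init(TensorInfo(TensorShape(12U, 3U), 1, DataType::F32));

        NEGEMMTranspose1xW transpose;
        transpose.configure(&src, &dst);

        src.allocator()->allocate();
        dst.allocator()->allocate();
        // ... fill src, then:
        transpose.run();
    }
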
diff --git a/src/runtime/NEON/functions/NEGaussianPyramid.cpp b/src/runtime/NEON/functions/NEGaussianPyramid.cpp
index 1c75bee..cb8296b 100644
--- a/src/runtime/NEON/functions/NEGaussianPyramid.cpp
+++ b/src/runtime/NEON/functions/NEGaussianPyramid.cpp
@@ -76,23 +76,22 @@
         TensorShape tensor_shape = pyramid->info()->tensor_shape();
         tensor_shape.set(0, (pyramid->info()->width() + 1) * SCALE_PYRAMID_HALF);
 
-        PyramidInfo pyramid_info;
-        pyramid_info.init(num_levels - 1, SCALE_PYRAMID_HALF, tensor_shape, Format::S16);
-
-        _tmp.init_auto_padding(pyramid_info);
-        _tmp.allocate();
+        PyramidInfo pyramid_info(num_levels - 1, SCALE_PYRAMID_HALF, tensor_shape, Format::S16);
+        _tmp.init(pyramid_info);
 
         for(unsigned int i = 0; i < num_levels - 1; ++i)
         {
-            /* Configure border */
-            _border_handler[i].configure(_pyramid->get_pyramid_level(i), 2, border_mode, PixelValue(constant_border_value));
-
             /* Configure horizontal kernel */
             _horizontal_reduction[i].configure(_pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i), border_mode == BorderMode::UNDEFINED);
 
             /* Configure vertical kernel */
             _vertical_reduction[i].configure(_tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1), border_mode == BorderMode::UNDEFINED);
+
+            /* Configure border */
+            _border_handler[i].configure(_pyramid->get_pyramid_level(i), _horizontal_reduction[i].border_size(), border_mode, PixelValue(constant_border_value));
         }
+
+        _tmp.allocate();
     }
 }
 
@@ -140,11 +139,8 @@
         _scale_nearest = arm_compute::cpp14::make_unique<NEScaleKernel[]>(num_levels - 1);
         _offsets       = arm_compute::cpp14::make_unique<Image[]>(num_levels - 1);
 
-        PyramidInfo pyramid_info;
-        pyramid_info.init(num_levels - 1, SCALE_PYRAMID_ORB, pyramid->info()->tensor_shape(), Format::U8);
-
-        _tmp.init_auto_padding(pyramid_info);
-        _tmp.allocate();
+        PyramidInfo pyramid_info(num_levels - 1, SCALE_PYRAMID_ORB, pyramid->info()->tensor_shape(), Format::U8);
+        _tmp.init(pyramid_info);
 
         for(unsigned int i = 0; i < num_levels - 1; ++i)
         {
@@ -153,9 +149,7 @@
 
             /* Allocate Image for the offsets used by NEAREST interpolation */
             TensorInfo tensor_info(TensorShape(width, height), Format::S32);
-            tensor_info.auto_padding();
             _offsets[i].allocator()->init(tensor_info);
-            _offsets[i].allocator()->allocate();
 
             /* Configure gaussian 5x5 */
             _gaus5x5[i].configure(_pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i), border_mode, constant_border_value);
@@ -163,7 +157,11 @@
             /* Configure scale image kernel */
             _scale_nearest[i].configure(_tmp.get_pyramid_level(i), nullptr, nullptr, _offsets.get() + i, _pyramid->get_pyramid_level(i + 1), InterpolationPolicy::NEAREST_NEIGHBOR,
                                         border_mode == BorderMode::UNDEFINED);
+
+            _offsets[i].allocator()->allocate();
         }
+
+        _tmp.allocate();
     }
 }
 
diff --git a/src/runtime/NEON/functions/NEHOGDescriptor.cpp b/src/runtime/NEON/functions/NEHOGDescriptor.cpp
index 8edb9cb..a5073b9 100644
--- a/src/runtime/NEON/functions/NEHOGDescriptor.cpp
+++ b/src/runtime/NEON/functions/NEHOGDescriptor.cpp
@@ -63,19 +63,13 @@
 
     // Allocate memory for magnitude, phase and hog space
     TensorInfo info_mag(shape_img, Format::S16);
-    info_mag.auto_padding();
     _mag.allocator()->init(info_mag);
-    _mag.allocator()->allocate();
 
     TensorInfo info_phase(shape_img, Format::U8);
-    info_phase.auto_padding();
     _phase.allocator()->init(info_phase);
-    _phase.allocator()->allocate();
 
     TensorInfo info_space(shape_hog_space, num_bins, DataType::F32);
-    info_space.auto_padding();
     _hog_space.allocator()->init(info_space);
-    _hog_space.allocator()->allocate();
 
     // Initialise gradient kernel
     _gradient.configure(input, &_mag, &_phase, hog_info->phase_type(), border_mode, constant_border_value);
@@ -85,6 +79,11 @@
 
     // Initialize HOG norm kernel
     _block_norm.configure(&_hog_space, output, hog->info());
+
+    // Allocate intermediate tensors
+    _mag.allocator()->allocate();
+    _phase.allocator()->allocate();
+    _hog_space.allocator()->allocate();
 }
 
 void NEHOGDescriptor::run()
diff --git a/src/runtime/NEON/functions/NEHOGGradient.cpp b/src/runtime/NEON/functions/NEHOGGradient.cpp
index c82f0af..c5b37f4 100644
--- a/src/runtime/NEON/functions/NEHOGGradient.cpp
+++ b/src/runtime/NEON/functions/NEHOGGradient.cpp
@@ -45,11 +45,8 @@
 
     // Allocate image memory
     TensorInfo info(shape_img, Format::S16);
-    info.auto_padding();
     _gx.allocator()->init(info);
-    _gx.allocator()->allocate();
     _gy.allocator()->init(info);
-    _gy.allocator()->allocate();
 
     // Initialise derivate kernel
     _derivative.configure(input, &_gx, &_gy, border_mode, constant_border_value);
@@ -67,6 +64,10 @@
         k->configure(&_gx, &_gy, output_magnitude, output_phase);
         _mag_phase = std::move(k);
     }
+
+    // Allocate intermediate tensors
+    _gx.allocator()->allocate();
+    _gy.allocator()->allocate();
 }
 
 void NEHOGGradient::run()
diff --git a/src/runtime/NEON/functions/NEHOGMultiDetection.cpp b/src/runtime/NEON/functions/NEHOGMultiDetection.cpp
index 4ebe80d..effa64f 100644
--- a/src/runtime/NEON/functions/NEHOGMultiDetection.cpp
+++ b/src/runtime/NEON/functions/NEHOGMultiDetection.cpp
@@ -121,14 +121,10 @@
 
     // Allocate tensors for magnitude and phase
     TensorInfo info_mag(shape_img, Format::S16);
-    info_mag.auto_padding();
     _mag.allocator()->init(info_mag);
-    _mag.allocator()->allocate();
 
     TensorInfo info_phase(shape_img, Format::U8);
-    info_phase.auto_padding();
     _phase.allocator()->init(info_phase);
-    _phase.allocator()->allocate();
 
     // Initialise gradient kernel
     _gradient_kernel.configure(input, &_mag, &_phase, phase_type, border_mode, constant_border_value);
@@ -142,7 +138,7 @@
         const Size2D &cell     = multi_hog->model(idx_multi_hog)->info()->cell_size();
         const size_t  num_bins = multi_hog->model(idx_multi_hog)->info()->num_bins();
 
-        // Calculate number of cells along the x and y directions for the hog_space */
+        // Calculate number of cells along the x and y directions for the hog_space
         const size_t num_cells_x = width / cell.width;
         const size_t num_cells_y = height / cell.height;
 
@@ -153,9 +149,7 @@
 
         // Allocate HOG space
         TensorInfo info_space(shape_hog_space, num_bins, DataType::F32);
-        info_space.auto_padding();
         _hog_space[i].allocator()->init(info_space);
-        _hog_space[i].allocator()->allocate();
 
         // Initialise orientation binning kernel
         _orient_bin_kernel[i].configure(&_mag, &_phase, _hog_space.get() + i, multi_hog->model(idx_multi_hog)->info());
@@ -169,9 +163,7 @@
 
         // Allocate normalized HOG space
         TensorInfo tensor_info(*(multi_hog->model(idx_multi_hog)->info()), width, height);
-        tensor_info.auto_padding();
         _hog_norm_space[i].allocator()->init(tensor_info);
-        _hog_norm_space[i].allocator()->allocate();
 
         // Initialize block normalization kernel
         _block_norm_kernel[i].configure(_hog_space.get() + idx_orient_bin, _hog_norm_space.get() + i, multi_hog->model(idx_multi_hog)->info());
@@ -187,6 +179,20 @@
 
     // Configure non maxima suppression kernel
     _non_maxima_kernel->configure(_detection_windows, min_distance);
+
+    // Allocate intermediate tensors
+    _mag.allocator()->allocate();
+    _phase.allocator()->allocate();
+
+    for(size_t i = 0; i < _num_orient_bin_kernel; ++i)
+    {
+        _hog_space[i].allocator()->allocate();
+    }
+
+    for(size_t i = 0; i < _num_block_norm_kernel; ++i)
+    {
+        _hog_norm_space[i].allocator()->allocate();
+    }
 }
 
 void NEHOGMultiDetection::run()
diff --git a/src/runtime/NEON/functions/NEHarrisCorners.cpp b/src/runtime/NEON/functions/NEHarrisCorners.cpp
index 0f5215f..adefd47 100644
--- a/src/runtime/NEON/functions/NEHarrisCorners.cpp
+++ b/src/runtime/NEON/functions/NEHarrisCorners.cpp
@@ -55,31 +55,27 @@
     ARM_COMPUTE_ERROR_ON(!(block_size == 3 || block_size == 5 || block_size == 7));
 
     const TensorShape shape = input->info()->tensor_shape();
-    TensorInfo        tensor_info;
+    TensorInfo        tensor_info_gxgy;
 
-    /* Allocate memory */
     if(gradient_size < 7)
     {
-        tensor_info.init_auto_padding(shape, Format::S16);
+        tensor_info_gxgy.init(shape, Format::S16);
     }
     else
     {
-        tensor_info.init_auto_padding(shape, Format::S32);
+        tensor_info_gxgy.init(shape, Format::S32);
     }
 
-    _gx.allocator()->init(tensor_info);
-    _gx.allocator()->allocate();
-    _gy.allocator()->init(tensor_info);
-    _gy.allocator()->allocate();
+    _gx.allocator()->init(tensor_info_gxgy);
+    _gy.allocator()->init(tensor_info_gxgy);
 
-    tensor_info.init_auto_padding(shape, Format::F32);
-    _score.allocator()->init(tensor_info);
-    _score.allocator()->allocate();
-    _nonmax.allocator()->init(tensor_info);
-    _nonmax.allocator()->allocate();
+    TensorInfo tensor_info_score(shape, Format::F32);
+    _score.allocator()->init(tensor_info_score);
+    _nonmax.allocator()->init(tensor_info_score);
+
     _corners_list = arm_compute::cpp14::make_unique<InternalKeypoint[]>(shape.x() * shape.y());
 
-    /* Set/init Sobel kernel accordingly with gradient_size */
+    // Set/init the Sobel kernel according to gradient_size
     switch(gradient_size)
     {
         case 3:
@@ -107,11 +103,7 @@
             ARM_COMPUTE_ERROR("Gradient size not implemented");
     }
 
-    /* Configure border filling before harris score*/
-    _border_gx.configure(&_gx, block_size / 2, border_mode, constant_border_value);
-    _border_gy.configure(&_gy, block_size / 2, border_mode, constant_border_value);
-
-    /* Normalization factor */
+    // Normalization factor
     const float norm_factor = 1.0f / (255.0f * pow(4.0f, gradient_size / 2) * block_size);
 
     if(use_fp16)
@@ -144,7 +136,7 @@
     }
     else
     {
-        /* Set/init Harris Score kernel accordingly with block_size */
+        // Set/init the Harris Score kernel according to block_size
         switch(block_size)
         {
             case 3:
@@ -171,38 +163,50 @@
                 break;
         }
     }
-    /* Init non-maxima suppression function */
+
+    // Configure border filling before harris score
+    _border_gx.configure(&_gx, _harris_score->border_size(), border_mode, constant_border_value);
+    _border_gy.configure(&_gy, _harris_score->border_size(), border_mode, constant_border_value);
+
+    // Init non-maxima suppression function
     _non_max_suppr.configure(&_score, &_nonmax, border_mode);
 
-    /* Init corner candidates kernel */
+    // Init corner candidates kernel
     _candidates.configure(&_nonmax, _corners_list.get(), &_num_corner_candidates);
 
-    /* Init euclidean distance*/
+    // Init euclidean distance
     _sort_euclidean.configure(_corners_list.get(), corners, &_num_corner_candidates, min_dist);
+
+    // Allocate once all the configure methods have been called
+    _gx.allocator()->allocate();
+    _gy.allocator()->allocate();
+    _score.allocator()->allocate();
+    _nonmax.allocator()->allocate();
 }
 
 void NEHarrisCorners::run()
 {
     ARM_COMPUTE_ERROR_ON_MSG(_sobel == nullptr, "Unconfigured function");
 
-    /* Init to 0 number of corner candidates */
+    // Initialize the number of corner candidates to 0
     _num_corner_candidates = 0;
 
-    /* Run Sobel kernel */
+    // Run Sobel kernel
     _sobel->run();
 
-    /* Fill border before harris score kernel */
+    // Fill border before harris score kernel
     _border_gx.run(_border_gx.window());
     _border_gy.run(_border_gy.window());
 
-    /* Run harris score kernel */
+    // Run harris score kernel
     NEScheduler::get().multithread(_harris_score.get());
 
-    /* Run non-maxima suppression */
+    // Run non-maxima suppression
     _non_max_suppr.run();
 
-    /* Run corner candidate kernel */
+    // Run corner candidate kernel
     NEScheduler::get().multithread(&_candidates);
 
+    // Run sort & euclidean distance
     _sort_euclidean.run(_sort_euclidean.window());
 }
diff --git a/src/runtime/NEON/functions/NELaplacianPyramid.cpp b/src/runtime/NEON/functions/NELaplacianPyramid.cpp
index 2065f3c..8232c79 100644
--- a/src/runtime/NEON/functions/NELaplacianPyramid.cpp
+++ b/src/runtime/NEON/functions/NELaplacianPyramid.cpp
@@ -45,16 +45,19 @@
 {
     ARM_COMPUTE_ERROR_ON_MSG(0 == _num_levels, "Unconfigured function");
 
-    _gaussian_pyr_function.run(); // compute gaussian pyramid
+    // Compute Gaussian Pyramid
+    _gaussian_pyr_function.run();
 
     for(unsigned int i = 0; i < _num_levels; ++i)
     {
-        _convf[i].run(); // convolute gaussian pyramid
+        // Apply Gaussian filter to the Gaussian pyramid image
+        _convf[i].run();
     }
 
     for(unsigned int i = 0; i < _num_levels; ++i)
     {
-        _subf[i].run(); // compute laplacian image
+        // Compute the Laplacian image
+        _subf[i].run();
     }
 
     _depth_function.run();
@@ -77,10 +80,8 @@
     PyramidInfo pyramid_info;
     pyramid_info.init(_num_levels, 0.5f, pyramid->info()->tensor_shape(), arm_compute::Format::U8);
 
-    _gauss_pyr.init_auto_padding(pyramid_info);
-    _gauss_pyr.allocate();
-    _conv_pyr.init_auto_padding(pyramid_info);
-    _conv_pyr.allocate();
+    _gauss_pyr.init(pyramid_info);
+    _conv_pyr.init(pyramid_info);
 
     // Create Gaussian Pyramid function
     _gaussian_pyr_function.configure(input, &_gauss_pyr, border_mode, constant_border_value);
@@ -95,4 +96,7 @@
     }
 
     _depth_function.configure(_conv_pyr.get_pyramid_level(_num_levels - 1), output, ConvertPolicy::WRAP, 0);
+
+    _gauss_pyr.allocate();
+    _conv_pyr.allocate();
 }
diff --git a/src/runtime/NEON/functions/NELaplacianReconstruct.cpp b/src/runtime/NEON/functions/NELaplacianReconstruct.cpp
index dc59e14..36ac4a7 100644
--- a/src/runtime/NEON/functions/NELaplacianReconstruct.cpp
+++ b/src/runtime/NEON/functions/NELaplacianReconstruct.cpp
@@ -57,8 +57,8 @@
     // Create and initialize the tmp pyramid: I(n-2) = upsample( input + Laplace(n-1) )
     PyramidInfo pyramid_info;
     pyramid_info.init(num_levels, 0.5f, output->info()->tensor_shape(), arm_compute::Format::S16);
-    _tmp_pyr.init_auto_padding(pyramid_info);
-    _tmp_pyr.allocate();
+
+    _tmp_pyr.init(pyramid_info);
 
     // Allocate add and scale functions. Level 0 does not need to be scaled.
     _addf   = arm_compute::cpp14::make_unique<NEArithmeticAddition[]>(num_levels);
@@ -77,6 +77,8 @@
 
     // Convert level 0 from S16 to U8
     _depthf.configure(_tmp_pyr.get_pyramid_level(0), output, ConvertPolicy::SATURATE, 0);
+
+    _tmp_pyr.allocate();
 }
 
 void NELaplacianReconstruct::run()
diff --git a/src/runtime/NEON/functions/NEOpticalFlow.cpp b/src/runtime/NEON/functions/NEOpticalFlow.cpp
index 5297991..bd89a0b 100644
--- a/src/runtime/NEON/functions/NEOpticalFlow.cpp
+++ b/src/runtime/NEON/functions/NEOpticalFlow.cpp
@@ -87,12 +87,9 @@
         const unsigned int height_ith = new_ith_input->info()->dimension(1);
 
         TensorInfo tensor_info(TensorShape(width_ith, height_ith), Format::S16);
-        tensor_info.auto_padding();
 
         _scharr_gx[i].allocator()->init(tensor_info);
-        _scharr_gx[i].allocator()->allocate();
         _scharr_gy[i].allocator()->init(tensor_info);
-        _scharr_gy[i].allocator()->allocate();
 
         /* Init Scharr kernel */
         _func_scharr[i].configure(old_ith_input, _scharr_gx.get() + i, _scharr_gy.get() + i, border_mode, constant_border_value);
@@ -103,6 +100,9 @@
                                      &_old_points_internal, &_new_points_internal,
                                      termination, use_initial_estimate, epsilon, num_iterations, window_dimension,
                                      i, _num_levels, pyr_scale, border_offset);
+
+        _scharr_gx[i].allocator()->allocate();
+        _scharr_gy[i].allocator()->allocate();
     }
 }
 
diff --git a/src/runtime/NEON/functions/NEScale.cpp b/src/runtime/NEON/functions/NEScale.cpp
index bd8cf3a..b70f626 100644
--- a/src/runtime/NEON/functions/NEScale.cpp
+++ b/src/runtime/NEON/functions/NEScale.cpp
@@ -41,7 +41,7 @@
 
 namespace
 {
-void precompute_dx_dy_offsets(ITensor *dx, ITensor *dy, ITensor *offsets, float wr, float hr, size_t input_stride, size_t input_element_size)
+void precompute_dx_dy_offsets(ITensor *dx, ITensor *dy, ITensor *offsets, float wr, float hr, size_t input_element_size)
 {
     ARM_COMPUTE_ERROR_ON(nullptr == offsets);
 
@@ -63,7 +63,7 @@
             const int   in_xi = std::floor(in_x);
             const int   in_yi = std::floor(in_y);
 
-            *reinterpret_cast<int32_t *>(offsets_it.ptr()) = in_xi * input_element_size + in_yi * input_stride;
+            *reinterpret_cast<int32_t *>(offsets_it.ptr()) = in_xi * input_element_size;
             *reinterpret_cast<float *>(dx_it.ptr())        = in_x - in_xi;
             *reinterpret_cast<float *>(dy_it.ptr())        = in_y - in_yi;
         },
@@ -77,9 +77,8 @@
         execute_window_loop(win, [&](const Coordinates & id)
         {
             const size_t in_xi = (id.x() + 0.5f) * wr;
-            const size_t in_yi = (id.y() + 0.5f) * hr;
 
-            *reinterpret_cast<int32_t *>(offsets_it.ptr()) = in_xi * input_element_size + in_yi * input_stride;
+            *reinterpret_cast<int32_t *>(offsets_it.ptr()) = in_xi * input_element_size;
         },
         offsets_it);
     }
@@ -108,8 +107,7 @@
     const auto wr = static_cast<float>(input->info()->dimension(0)) / static_cast<float>(output->info()->dimension(0));
     const auto hr = static_cast<float>(input->info()->dimension(1)) / static_cast<float>(output->info()->dimension(1));
 
-    // Get the input stride and the input element size
-    const size_t input_stride       = input->info()->strides_in_bytes()[1];
+    // Get the element size of the input image
     const size_t input_element_size = input->info()->element_size();
 
     // Area interpolation behaves as Nearest Neighbour in case of up-sampling
@@ -128,7 +126,6 @@
         case InterpolationPolicy::NEAREST_NEIGHBOR:
         {
             TensorInfo tensor_info_offsets(shape, Format::S32);
-            tensor_info_offsets.auto_padding();
             _offsets.allocator()->init(tensor_info_offsets);
 
             k->configure(input, nullptr, nullptr, &_offsets, output, policy, border_undefined);
@@ -137,7 +134,7 @@
             _offsets.allocator()->allocate();
 
             // Pre-compute offsets for nearest interpolation
-            precompute_dx_dy_offsets(nullptr, nullptr, &_offsets, wr, hr, input_stride, input_element_size);
+            precompute_dx_dy_offsets(nullptr, nullptr, &_offsets, wr, hr, input_element_size);
             break;
         }
         case InterpolationPolicy::BILINEAR:
@@ -145,9 +142,6 @@
             TensorInfo tensor_info_offsets(shape, Format::S32);
             TensorInfo tensor_info_dxdy(shape, Format::F32);
 
-            tensor_info_offsets.auto_padding();
-            tensor_info_dxdy.auto_padding();
-
             _offsets.allocator()->init(tensor_info_offsets);
             _dx.allocator()->init(tensor_info_dxdy);
             _dy.allocator()->init(tensor_info_dxdy);
@@ -160,7 +154,7 @@
             _dy.allocator()->allocate();
 
             // Pre-compute dx, dy and offsets for bilinear interpolation
-            precompute_dx_dy_offsets(&_dx, &_dy, &_offsets, wr, hr, input_stride, input_element_size);
+            precompute_dx_dy_offsets(&_dx, &_dy, &_offsets, wr, hr, input_element_size);
             break;
         }
         case InterpolationPolicy::AREA:
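
With this change the precomputed offsets carry only the horizontal byte offset; the scale kernel is expected to add the row offset from the input stride itself. The dx/dy tensors hold the fractional parts used as bilinear weights; a scalar reference of that weighting, for illustration only (not the NEON kernel):

    // Scalar reference of bilinear weighting with the fractional parts dx, dy
    // computed above; p00..p11 are the four neighbouring input pixels.
    float bilinear_sample(float p00, float p01, float p10, float p11, float dx, float dy)
    {
        return (1.0f - dx) * (1.0f - dy) * p00 + dx * (1.0f - dy) * p01
               + (1.0f - dx) * dy * p10 + dx * dy * p11;
    }
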
diff --git a/src/runtime/NEON/functions/NESoftmaxLayer.cpp b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
index 3949529..55d4d3a 100644
--- a/src/runtime/NEON/functions/NESoftmaxLayer.cpp
+++ b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
@@ -32,7 +32,7 @@
 using namespace arm_compute;
 
 NESoftmaxLayer::NESoftmaxLayer()
-    : _max_kernel(), _shift_exp_sum_kernel(), _norm_kernel(), _fill_border_kernel(), _max(), _sum(), _tmp()
+    : _max_kernel(), _shift_exp_sum_kernel(), _norm_kernel(), _fill_border_kernel(), _fill_border_kernel_sum(), _max(), _sum(), _tmp()
 {
 }
 
@@ -42,31 +42,34 @@
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
 
     // Create intermediate tensors shapes
-    TensorInfo tensor_info_tmp(input->info()->tensor_shape(), input->info()->num_channels(), input->info()->data_type());
-    tensor_info_tmp.auto_padding();
-    _tmp.allocator()->init(tensor_info_tmp);
-    _tmp.allocator()->allocate();
+    _tmp.allocator()->init(TensorInfo(input->info()->tensor_shape(), input->info()->num_channels(), input->info()->data_type()));
 
     TensorShape shape = input->info()->tensor_shape();
     shape.set(0, 1);
     TensorInfo tensor_info_max_sum(shape, input->info()->num_channels(), input->info()->data_type());
-    tensor_info_max_sum.auto_padding();
     _max.allocator()->init(tensor_info_max_sum);
-    _max.allocator()->allocate();
     _sum.allocator()->init(tensor_info_max_sum);
-    _sum.allocator()->allocate();
 
     // Configure Kernels
-    _fill_border_kernel.configure(input, 3, BorderMode::CONSTANT, PixelValue(-FLT_MAX));
     _max_kernel.configure(input, &_max);
     _shift_exp_sum_kernel.configure(input, &_max, &_tmp, &_sum);
     _norm_kernel.configure(&_tmp, &_sum, output);
+    _fill_border_kernel.configure(input, _max_kernel.border_size(), BorderMode::CONSTANT, PixelValue(-FLT_MAX));
+    // Fill the border around the tmp buffer with a sensible negative value.
+    // This avoids exp(-FLT_MAX), which would lead to -inf and corrupt the calculation of the sum when the input is not a multiple of the number of processed elements.
+    _fill_border_kernel_sum.configure(input, _shift_exp_sum_kernel.border_size(), BorderMode::CONSTANT, PixelValue(-50.f));
+
+    // Allocate intermediate tensors
+    _tmp.allocator()->allocate();
+    _max.allocator()->allocate();
+    _sum.allocator()->allocate();
 }
 
 void NESoftmaxLayer::run()
 {
     NEScheduler::get().multithread(&_fill_border_kernel);
     NEScheduler::get().multithread(&_max_kernel);
+    NEScheduler::get().multithread(&_fill_border_kernel_sum);
     NEScheduler::get().multithread(&_shift_exp_sum_kernel);
     NEScheduler::get().multithread(&_norm_kernel);
 }
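
For reference, the three kernels scheduled above compute the numerically stable softmax softmax(x)_i = exp(x_i - max(x)) / sum_j exp(x_j - max(x)); the extra border fill keeps padded elements from polluting the row-wise sum. A scalar sketch of the same computation (reference only, not the NEON implementation):

    #include <algorithm>
    #include <cmath>
    #include <vector>

    // Scalar reference of what the kernels compute per row:
    // 1) row maximum, 2) shifted exponentials and their sum, 3) normalisation.
    std::vector<float> softmax_reference(const std::vector<float> &x)
    {
        const float max_val = *std::max_element(x.begin(), x.end());

        std::vector<float> out(x.size());
        float              sum = 0.f;
        for(std::size_t i = 0; i < x.size(); ++i)
        {
            out[i] = std::exp(x[i] - max_val); // _max_kernel + _shift_exp_sum_kernel
            sum += out[i];
        }
        for(float &v : out)
        {
            v /= sum; // _norm_kernel
        }
        return out;
    }
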