arm_compute v18.05
diff --git a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
index 95fcf88..0a977ad 100644
--- a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
@@ -37,11 +37,11 @@
 
 NEDepthwiseConvolutionLayer3x3::NEDepthwiseConvolutionLayer3x3()
     : _dwc_kernel(), _output_stage_kernel(), _border_handler(), _permute_input(), _permute_weights(), _permute_output(), _accumulator(), _input_nhwc(), _weights_hwio(), _output_nhwc(), _has_bias(false),
-      _is_quantized(false), _is_optimized(false), _are_weights_reshaped(false)
+      _is_quantized(false), _is_optimized(false), _are_weights_reshaped(false), _is_nchw(true), _is_first_run(true)
 {
 }
 
-void NEDepthwiseConvolutionLayer3x3::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info)
+void NEDepthwiseConvolutionLayer3x3::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier)
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
@@ -52,30 +52,39 @@
     _has_bias     = biases != nullptr;
     _is_optimized = NEDepthwiseConvolutionLayer3x3Kernel::is_optimized_execution_possible(input->info()->tensor_shape(),
                                                                                           conv_info,
-                                                                                          input->info()->data_type());
+                                                                                          input->info()->data_type(),
+                                                                                          depth_multiplier,
+                                                                                          input->info()->data_layout());
     _are_weights_reshaped = false;
+    _is_nchw              = input->info()->data_layout() == DataLayout::NCHW;
+
+    ARM_COMPUTE_ERROR_ON(!_is_optimized && !_is_nchw);
 
     if(_is_optimized)
     {
-        // Configure the function to transform the input tensor from NCHW -> NHWC
-        _permute_input.configure(input, &_input_nhwc, PermutationVector(2U, 0U, 1U));
+        if(_is_nchw)
+        {
+            // Configure the function to transform the input tensor from NCHW -> NHWC
+            _permute_input.configure(input, &_input_nhwc, PermutationVector(2U, 0U, 1U));
 
-        // Configure the function to transform the weights tensor from IHW -> HWI
-        _permute_weights.configure(weights, &_weights_hwio, PermutationVector(2U, 0U, 1U));
+            // Configure the function to transform the weights tensor from IHW -> HWI
+            _permute_weights.configure(weights, &_weights_hwio, PermutationVector(2U, 0U, 1U));
 
-        // Configure optimized depthwise
-        _dwc_kernel.configure(&_input_nhwc, &_weights_hwio, &_output_nhwc, conv_info, DataLayout::NHWC);
+            // Configure optimized depthwise
+            _dwc_kernel.configure(&_input_nhwc, &_weights_hwio, &_output_nhwc, conv_info, depth_multiplier, DataLayout::NHWC);
 
-        // Configure the function to transform the convoluted output to ACL's native ordering format NCHW
-        _permute_output.configure(&_output_nhwc, output, PermutationVector(1U, 2U, 0U));
+            // Configure the function to transform the convoluted output to ACL's native ordering format NCHW
+            _permute_output.configure(&_output_nhwc, output, PermutationVector(1U, 2U, 0U));
 
-        // Allocate tensors
-        _input_nhwc.allocator()->allocate();
-        _weights_hwio.allocator()->allocate();
-        _output_nhwc.allocator()->allocate();
-
-        // Create convolver (deferred)
-        _dwc_kernel.generate_convolver();
+            // Allocate tensors
+            _input_nhwc.allocator()->allocate();
+            _weights_hwio.allocator()->allocate();
+            _output_nhwc.allocator()->allocate();
+        }
+        else
+        {
+            _dwc_kernel.configure(input, weights, output, conv_info, depth_multiplier, DataLayout::NHWC);
+        }
     }
     else
     {
@@ -88,7 +97,7 @@
         }
 
         // Configure depthwise convolution kernel
-        _dwc_kernel.configure(input, weights, (_is_quantized) ? &_accumulator : output, conv_info);
+        _dwc_kernel.configure(input, weights, (_is_quantized) ? &_accumulator : output, conv_info, depth_multiplier);
 
         // Configure border handler
         _border_handler.configure(input, _dwc_kernel.border_size(), BorderMode::CONSTANT, zero_value);
@@ -116,8 +125,15 @@
 
 void NEDepthwiseConvolutionLayer3x3::run()
 {
+    if(_is_first_run && _is_optimized)
+    {
+        _is_first_run = false;
+        // Create convolver (deferred)
+        _dwc_kernel.generate_convolver();
+    }
+
     // Permute weights in HWIO format if the optimized kernel will be executedd
-    if(!_are_weights_reshaped && _is_optimized)
+    if(!_are_weights_reshaped && _is_optimized && _is_nchw)
     {
         _are_weights_reshaped = true;
         _permute_weights.run();
@@ -126,8 +142,11 @@
     // Handle input
     if(_is_optimized)
     {
-        // Permute input to NHWC format execution
-        _permute_input.run();
+        if(_is_nchw)
+        {
+            // Permute input to NHWC format execution
+            _permute_input.run();
+        }
     }
     else
     {
@@ -139,7 +158,7 @@
     NEScheduler::get().schedule(&_dwc_kernel, Window::DimX);
 
     // Permute output to ACL's native NCHW format in case of NHWC execution
-    if(_is_optimized)
+    if(_is_optimized && _is_nchw)
     {
         _permute_output.run();
     }
@@ -153,31 +172,37 @@
 
 NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayer()
     : _im2col_kernel(), _weights_reshape_kernel(), _v2mm_kernel(), _vector_to_tensor_kernel(), _output_stage_kernel(), _v2mm_input_fill_border(), _v2mm_weights_fill_border(), _input_reshaped(),
-      _weights_reshaped(), _v2mm_output(), _output_reshaped(), _is_quantized(false)
+      _weights_reshaped(), _v2mm_output(), _output_reshaped(), _is_first_run(true), _is_quantized(false), _original_weights(nullptr)
 {
 }
 
-void NEDepthwiseConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info)
+void NEDepthwiseConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier)
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
-    ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != weights->info()->dimension(2));
+    ARM_COMPUTE_ERROR_ON((input->info()->dimension(2) * depth_multiplier) != weights->info()->dimension(2));
 
     const size_t weights_w = weights->info()->dimension(0);
     const size_t weights_h = weights->info()->dimension(1);
     const size_t weights_z = weights->info()->dimension(2);
 
-    _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
+    _is_quantized     = is_data_type_quantized_asymmetric(input->info()->data_type());
+    _is_first_run     = true;
+    _original_weights = weights;
 
     // Should bias be appended ?
     bool append_bias = (biases != nullptr) && !_is_quantized;
 
     // Calculate output shape
-    TensorShape dwc_output_shape = shape_calculator::compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info);
+    TensorShape output_shape = shape_calculator::compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info, depth_multiplier);
+
+    // Output auto inizialitation if not yet initialized
+    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
 
     // Output width and height
-    const unsigned int conv_w = dwc_output_shape.x();
-    const unsigned int conv_h = dwc_output_shape.y();
+    const unsigned int conv_w = output_shape.x();
+    const unsigned int conv_h = output_shape.y();
 
     // Set up intermediate tensors
     const size_t patch_size = weights_w * weights_h + (append_bias ? 1 : 0);
@@ -189,7 +214,7 @@
     shape_im2col.set(1, conv_size);
     shape_im2col.set(2, weights_z);
     _input_reshaped.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
-    _im2col_kernel.configure(input, &_input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias);
+    _im2col_kernel.configure(input, &_input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier);
 
     // Weights reshape configuration
     const TensorShape shape_weights_reshape(patch_size, weights_z);
@@ -204,7 +229,7 @@
     shape_v2mm_out.set(2, 1);
     _v2mm_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(v2mm_dt).set_tensor_shape(shape_v2mm_out));
     _v2mm_kernel.configure(&_input_reshaped, &_weights_reshaped, &_v2mm_output);
-    _output_reshaped.allocator()->init(_v2mm_output.info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(dwc_output_shape));
+    _output_reshaped.allocator()->init(_v2mm_output.info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape));
     _vector_to_tensor_kernel.configure(&_v2mm_output, (_is_quantized) ? &_output_reshaped : output, conv_w, conv_h);
 
     // Output staged configuration
@@ -241,10 +266,21 @@
 
 void NEDepthwiseConvolutionLayer::run()
 {
+    // Run weights reshaping (Runs once for every configure)
+    if(_is_first_run)
+    {
+        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+        NEScheduler::get().schedule(&_weights_reshape_kernel, Window::DimX);
+        NEScheduler::get().schedule(&_v2mm_weights_fill_border, Window::DimX);
+        _is_first_run = false;
+
+        // Mark original weights tensor as unused
+        _original_weights->mark_as_unused();
+    }
+
     NEScheduler::get().schedule(&_im2col_kernel, Window::DimX);
-    NEScheduler::get().schedule(&_weights_reshape_kernel, Window::DimX);
     NEScheduler::get().schedule(&_v2mm_input_fill_border, Window::DimX);
-    NEScheduler::get().schedule(&_v2mm_weights_fill_border, Window::DimX);
     NEScheduler::get().schedule(&_v2mm_kernel, Window::DimX);
     NEScheduler::get().schedule(&_vector_to_tensor_kernel, Window::DimX);
     if(_is_quantized)