arm_compute v19.05
diff --git a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
index 15cbfce..97b0a01 100644
--- a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
@@ -45,10 +45,18 @@
 }
 
 void CLDepthwiseConvolutionLayer3x3::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier,
-                                               ActivationLayerInfo act_info)
+                                               ActivationLayerInfo act_info, const Size2D &dilation)
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+    // idx_w and idx_h only used for validation
+    const size_t idx_w = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
+    const size_t idx_h = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
+    ARM_COMPUTE_UNUSED(idx_w);
+    ARM_COMPUTE_UNUSED(idx_h);
+
+    ARM_COMPUTE_ERROR_ON(weights->info()->dimension(idx_w) + (weights->info()->dimension(idx_w) - 1) * (dilation.x() - 1) > input->info()->dimension(idx_w) + conv_info.pad_left() + conv_info.pad_right());
+    ARM_COMPUTE_ERROR_ON(weights->info()->dimension(idx_h) + (weights->info()->dimension(idx_h) - 1) * (dilation.y() - 1) > input->info()->dimension(idx_h) + conv_info.pad_top() + conv_info.pad_bottom());
 
     const bool is_nhwc = input->info()->data_layout() == DataLayout::NHWC;
 
@@ -62,11 +70,13 @@
     const ICLTensor *weights_to_use = weights;
     ICLTensor       *output_to_use  = output;
 
-    const bool                      is_stride_1       = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1));
-    const bool                      is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device());
+    const bool is_stride_1            = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1));
+    const bool is_dot8_supported      = dot8_supported(CLKernelLibrary::get().get_device());
+    const bool is_stride_1_dilation_1 = (is_stride_1 && dilation.x() == 1 && dilation.y() == 1);
+
     DepthwiseConvolutionReshapeInfo info;
     info.c0        = 4;
-    info.transpose = is_stride_1 && is_dot8_supported;
+    info.transpose = is_stride_1_dilation_1 && is_dot8_supported;
 
     if(_needs_permute)
     {
@@ -103,7 +113,7 @@
 
     // Configure kernel
     _kernel->set_target(CLScheduler::get().target());
-    _kernel->configure(input_to_use, weights_to_use, biases, output_to_use, conv_info, depth_multiplier, act_info);
+    _kernel->configure(input_to_use, weights_to_use, biases, output_to_use, conv_info, depth_multiplier, act_info, dilation);
 
     // Permute output if needed
     if(_needs_permute)
@@ -126,26 +136,26 @@
 }
 
 Status CLDepthwiseConvolutionLayer3x3::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                                                unsigned int        depth_multiplier,
-                                                ActivationLayerInfo act_info, GPUTarget gpu_target)
+                                                unsigned int depth_multiplier, ActivationLayerInfo act_info, GPUTarget gpu_target, const Size2D &dilation)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
     ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
 
-    const bool                      is_nhwc               = input->data_layout() == DataLayout::NHWC;
-    const bool                      needs_permute         = is_nhwc && (depth_multiplier > 1);
-    const bool                      needs_weights_reshape = is_nhwc && (depth_multiplier == 1);
-    const bool                      is_stride_1           = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1));
-    const bool                      is_dot8_supported     = dot8_supported(CLKernelLibrary::get().get_device());
+    const bool                      is_nhwc                = input->data_layout() == DataLayout::NHWC;
+    const bool                      needs_permute          = is_nhwc && (depth_multiplier > 1);
+    const bool                      needs_weights_reshape  = is_nhwc && (depth_multiplier == 1);
+    const bool                      is_stride_1            = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1));
+    const bool                      is_stride_1_dilation_1 = (is_stride_1 && dilation.x() == 1 && dilation.y() == 1);
+    const bool                      is_dot8_supported      = dot8_supported(CLKernelLibrary::get().get_device());
     DepthwiseConvolutionReshapeInfo info;
     info.c0        = 4;
-    info.transpose = is_stride_1 && is_dot8_supported;
+    info.transpose = is_stride_1_dilation_1 && is_dot8_supported;
 
     if(needs_permute)
     {
         TensorShape permuted_input_shape   = input->tensor_shape();
         TensorShape permuted_weights_shape = weights->tensor_shape();
-        TensorShape permuted_output_shape  = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier);
+        TensorShape permuted_output_shape  = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation);
 
         permute(permuted_input_shape, PermutationVector(1U, 2U, 0U));
         permute(permuted_weights_shape, PermutationVector(1U, 2U, 0U));
@@ -155,7 +165,8 @@
         const TensorInfo permuted_weights = weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_weights_shape).set_data_layout(DataLayout::NCHW);
         const TensorInfo permuted_output  = output->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_output_shape).set_data_layout(DataLayout::NCHW);
 
-        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NCHWKernel::validate(&permuted_input, &permuted_weights, biases, &permuted_output, conv_info, depth_multiplier, act_info, gpu_target));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NCHWKernel::validate(&permuted_input, &permuted_weights, biases, &permuted_output, conv_info, depth_multiplier, act_info, gpu_target,
+                                                                                       dilation));
     }
     else if(is_nhwc)
     {
@@ -163,13 +174,13 @@
         {
             auto reshaped_weights_shape = arm_compute::misc::shape_calculator::compute_reshaped_depthwise_weights_shape(*weights, info);
             ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NHWCKernel::validate(input, &weights->clone()->set_tensor_shape(reshaped_weights_shape), biases, output, conv_info, depth_multiplier,
-                                                                                           act_info));
+                                                                                           act_info, dilation));
         }
-        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NHWCKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NHWCKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation));
     }
     else
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NCHWKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, gpu_target));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NCHWKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, gpu_target, dilation));
     }
 
     return Status{};
@@ -179,7 +190,7 @@
 {
     prepare();
 
-    _memory_group.acquire();
+    MemoryGroupResourceScope scope_mg(_memory_group);
 
     if(_needs_permute)
     {
@@ -192,8 +203,6 @@
     {
         _permute_output_to_nhwc.run();
     }
-
-    _memory_group.release();
 }
 
 void CLDepthwiseConvolutionLayer3x3::prepare()
@@ -229,7 +238,7 @@
 }
 
 void CLDepthwiseConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
-                                            unsigned int depth_multiplier, const ActivationLayerInfo &act_info)
+                                            unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
@@ -238,12 +247,15 @@
     const size_t idx_w = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
     const size_t idx_h = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
 
+    ARM_COMPUTE_ERROR_ON(weights->info()->dimension(idx_w) + (weights->info()->dimension(idx_w) - 1) * (dilation.x() - 1) > input->info()->dimension(idx_w) + conv_info.pad_left() + conv_info.pad_right());
+    ARM_COMPUTE_ERROR_ON(weights->info()->dimension(idx_h) + (weights->info()->dimension(idx_h) - 1) * (dilation.y() - 1) > input->info()->dimension(idx_h) + conv_info.pad_top() + conv_info.pad_bottom());
+
     const bool can_run_optimised_3x3_kernel = (weights->info()->dimension(idx_w) == 3) && (weights->info()->dimension(idx_h) == 3);
 
     if(bool(can_run_optimised_3x3_kernel))
     {
         auto f = arm_compute::support::cpp14::make_unique<CLDepthwiseConvolutionLayer3x3>();
-        f->configure(input, weights, biases, output, conv_info, depth_multiplier, act_info);
+        f->configure(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
         _optimised_function = std::move(f);
     }
     else
@@ -262,7 +274,7 @@
         const GPUTarget gpu_target  = CLScheduler::get().target();
 
         // Calculate output shape
-        TensorShape output_shape = shape_calculator::compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info, depth_multiplier);
+        TensorShape output_shape = shape_calculator::compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info, depth_multiplier, dilation);
 
         // Output auto inizialitation if not yet initialized
         auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
@@ -283,7 +295,7 @@
         shape_im2col.set(2, weights_z);
         _input_reshaped.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
         _im2col_kernel.set_target(gpu_target);
-        _im2col_kernel.configure(input, &_input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier);
+        _im2col_kernel.configure(input, &_input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier, dilation);
         CLScheduler::get().tune_kernel_static(_im2col_kernel);
 
         // Weights reshape configuration
@@ -310,7 +322,8 @@
             const QuantizationInfo output_quant_info = (output->info()->total_size() == 0) ? input->info()->quantization_info() : output->info()->quantization_info();
 
             float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output_quant_info.scale;
-            int   output_multiplier, output_shift;
+            int   output_multiplier;
+            int   output_shift;
             quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
             _output_stage_kernel.configure(&_output_reshaped, biases, output, output_multiplier, output_shift, output_quant_info.offset);
             _output_reshaped.allocator()->allocate();
@@ -345,11 +358,14 @@
 }
 
 Status CLDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                                             unsigned int depth_multiplier, const ActivationLayerInfo &act_info)
+                                             unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
 {
     const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
     const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
 
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) + (weights->dimension(idx_w) - 1) * (dilation.x() - 1) > input->dimension(idx_w) + conv_info.pad_left() + conv_info.pad_right());
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) + (weights->dimension(idx_h) - 1) * (dilation.y() - 1) > input->dimension(idx_h) + conv_info.pad_top() + conv_info.pad_bottom());
+
     const bool can_run_optimised_3x3_kernel = (weights->dimension(idx_w) == 3) && (weights->dimension(idx_h) == 3);
 
     if(can_run_optimised_3x3_kernel)
@@ -361,7 +377,7 @@
 
         const bool         is_quantized = is_data_type_quantized_asymmetric(input->data_type());
         const bool         append_bias  = (biases != nullptr) && !is_quantized;
-        const TensorShape  output_shape = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier);
+        const TensorShape  output_shape = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation);
         const size_t       weights_w    = weights->dimension(idx_w);
         const size_t       weights_h    = weights->dimension(idx_h);
         const size_t       weights_z    = weights->dimension(idx_c);
@@ -375,7 +391,7 @@
         shape_im2col.set(1, conv_size);
         shape_im2col.set(2, weights_z);
         TensorInfo input_reshaped(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseIm2ColKernel::validate(input, &input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseIm2ColKernel::validate(input, &input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier, dilation));
 
         const TensorShape shape_weights_reshape(patch_size, weights_z);
         TensorInfo        weights_reshaped(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_weights_reshape));
@@ -405,7 +421,7 @@
     }
     else
     {
-        CLDepthwiseConvolutionLayer3x3::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info);
+        CLDepthwiseConvolutionLayer3x3::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, GPUTarget::MIDGARD, dilation);
     }
     return Status{};
 }