arm_compute v19.08
diff --git a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
index 97b0a01..f01b58a 100644
--- a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
@@ -90,6 +90,7 @@
         // Configure the function to transform the weights tensor from HWI -> IHW
         _permute_weights_to_nchw.configure(weights, &_permuted_weights, PermutationVector(1U, 2U, 0U));
         _permuted_weights.info()->set_data_layout(DataLayout::NCHW);
+        _permuted_output.info()->set_quantization_info(output->info()->quantization_info());
 
         input_to_use   = &_permuted_input;
         weights_to_use = &_permuted_weights;
@@ -130,7 +131,7 @@
     PixelValue &&zero_value(0.f);
     if(is_data_type_quantized_asymmetric(input->info()->data_type()))
     {
-        zero_value = PixelValue(static_cast<uint8_t>(input->info()->quantization_info().offset));
+        zero_value = PixelValue(static_cast<uint8_t>(input->info()->quantization_info().uniform().offset));
     }
     _border_handler.configure(input_to_use, _kernel->border_size(), BorderMode::CONSTANT, zero_value);
 }
@@ -141,9 +142,10 @@
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
     ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
 
+    const bool                      is_quantized           = is_data_type_quantized_asymmetric(input->data_type());
     const bool                      is_nhwc                = input->data_layout() == DataLayout::NHWC;
     const bool                      needs_permute          = is_nhwc && (depth_multiplier > 1);
-    const bool                      needs_weights_reshape  = is_nhwc && (depth_multiplier == 1);
+    const bool                      needs_weights_reshape  = is_nhwc && (depth_multiplier == 1) && is_quantized;
     const bool                      is_stride_1            = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1));
     const bool                      is_stride_1_dilation_1 = (is_stride_1 && dilation.x() == 1 && dilation.y() == 1);
     const bool                      is_dot8_supported      = dot8_supported(CLKernelLibrary::get().get_device());
@@ -151,6 +153,17 @@
     info.c0        = 4;
     info.transpose = is_stride_1_dilation_1 && is_dot8_supported;
 
+    if(is_quantized)
+    {
+        const UniformQuantizationInfo iq_info = input->quantization_info().uniform();
+        const UniformQuantizationInfo wq_info = weights->quantization_info().uniform();
+        const UniformQuantizationInfo oq_info = (output->total_size() == 0) ? iq_info : output->quantization_info().uniform();
+
+        const float multiplier = iq_info.scale * wq_info.scale / oq_info.scale;
+        ARM_COMPUTE_UNUSED(multiplier);
+        ARM_COMPUTE_RETURN_ERROR_ON(multiplier > 1.0f);
+    }
+
     if(needs_permute)
     {
         TensorShape permuted_input_shape   = input->tensor_shape();
@@ -176,7 +189,10 @@
             ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NHWCKernel::validate(input, &weights->clone()->set_tensor_shape(reshaped_weights_shape), biases, output, conv_info, depth_multiplier,
                                                                                            act_info, dilation));
         }
-        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NHWCKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation));
+        else
+        {
+            ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NHWCKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation));
+        }
     }
     else
     {
@@ -288,6 +304,10 @@
         const size_t patch_size = weights_w * weights_h + ((append_bias) ? 1 : 0);
         const size_t conv_size  = conv_w * conv_h;
 
+        const UniformQuantizationInfo iq_info = input->info()->quantization_info().uniform();
+        const UniformQuantizationInfo wq_info = weights->info()->quantization_info().uniform();
+        const UniformQuantizationInfo oq_info = output->info()->quantization_info().uniform();
+
         // Im2Col configuration
         TensorShape shape_im2col = input->info()->tensor_shape();
         shape_im2col.set(0, patch_size);
@@ -319,11 +339,11 @@
         // Output staged configuration
         if(_is_quantized)
         {
-            const QuantizationInfo output_quant_info = (output->info()->total_size() == 0) ? input->info()->quantization_info() : output->info()->quantization_info();
+            const UniformQuantizationInfo output_quant_info = (output->info()->total_size() == 0) ? iq_info : oq_info;
 
-            float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output_quant_info.scale;
-            int   output_multiplier;
-            int   output_shift;
+            int         output_multiplier = 0;
+            int         output_shift      = 0;
+            const float multiplier        = iq_info.scale * wq_info.scale / output_quant_info.scale;
             quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
             _output_stage_kernel.configure(&_output_reshaped, biases, output, output_multiplier, output_shift, output_quant_info.offset);
             _output_reshaped.allocator()->allocate();
@@ -334,8 +354,8 @@
         PixelValue zero_w(static_cast<int32_t>(0));
         if(_is_quantized)
         {
-            zero_in = PixelValue(static_cast<int32_t>(input->info()->quantization_info().offset));
-            zero_w  = PixelValue(static_cast<int32_t>(weights->info()->quantization_info().offset));
+            zero_in = PixelValue(static_cast<int32_t>(iq_info.offset));
+            zero_w  = PixelValue(static_cast<int32_t>(wq_info.offset));
         }
         BorderSize border_size = _v2mm_kernel.border_size();
         _v2mm_input_fill_border.configure(&_input_reshaped, border_size, BorderMode::CONSTANT, zero_in);
@@ -368,7 +388,7 @@
 
     const bool can_run_optimised_3x3_kernel = (weights->dimension(idx_w) == 3) && (weights->dimension(idx_h) == 3);
 
-    if(can_run_optimised_3x3_kernel)
+    if(!can_run_optimised_3x3_kernel)
     {
         const size_t idx_c = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
 
@@ -410,6 +430,13 @@
 
         if(is_quantized)
         {
+            const UniformQuantizationInfo iq_info = input->quantization_info().uniform();
+            const UniformQuantizationInfo wq_info = weights->quantization_info().uniform();
+            const UniformQuantizationInfo oq_info = (output->total_size() == 0) ? iq_info : output->quantization_info().uniform();
+
+            const float multiplier = iq_info.scale * wq_info.scale / oq_info.scale;
+            ARM_COMPUTE_UNUSED(multiplier);
+            ARM_COMPUTE_RETURN_ERROR_ON(multiplier > 1.0f);
             ARM_COMPUTE_RETURN_ON_ERROR(CLDirectConvolutionLayerOutputStageKernel::validate(&output_reshaped, biases, output));
         }
 
@@ -421,7 +448,7 @@
     }
     else
     {
-        CLDepthwiseConvolutionLayer3x3::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, GPUTarget::MIDGARD, dilation);
+        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, GPUTarget::MIDGARD, dilation));
     }
     return Status{};
 }