arm_compute v18.11
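
CLWidthConcatenateLayer: dispatch to the specialised CLWidthConcatenate2TensorsKernel and CLWidthConcatenate4TensorsKernel when exactly two or four tensors are concatenated, and fall back to the generic per-input CLWidthConcatenateLayerKernel otherwise. An illustrative usage sketch follows the diff.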
diff --git a/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp b/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp
index 5233ff4..46a2d80 100644
--- a/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp
+++ b/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp
@@ -36,26 +36,46 @@
 
 CLWidthConcatenateLayer::CLWidthConcatenateLayer() // NOLINT
     : _concat_kernels_vector(),
+      _concat_x2_kernel(),
+      _concat_x4_kernel(),
       _num_inputs(0)
 {
 }
 
 Status CLWidthConcatenateLayer::validate(const std::vector<ITensorInfo *> &inputs_vector, const ITensorInfo *output) // NOLINT
 {
+    const unsigned int num_inputs = inputs_vector.size();
+
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
-    ARM_COMPUTE_RETURN_ERROR_ON(inputs_vector.size() < 2);
+    ARM_COMPUTE_RETURN_ERROR_ON(num_inputs < 2);
 
     // Output auto initialization if not yet initialized
     TensorInfo  tmp_output_info = *output->clone();
     TensorShape output_shape    = arm_compute::misc::shape_calculator::calculate_width_concatenate_shape(inputs_vector);
     auto_init_if_empty(tmp_output_info, output_shape, 1, inputs_vector[0]->data_type());
 
-    unsigned int width_offset = 0;
-    for(const auto &input : inputs_vector)
+    switch(num_inputs)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
-        ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenateLayerKernel::validate(input, width_offset, &tmp_output_info));
-        width_offset += input->dimension(0);
+        case 2:
+            // Validate the WidthConcatenate2Tensors kernel when there are exactly 2 inputs
+            ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(inputs_vector[0], inputs_vector[1]);
+            ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenate2TensorsKernel::validate(inputs_vector[0], inputs_vector[1], &tmp_output_info));
+            break;
+        case 4:
+            // Validate the WidthConcatenate4Tensors kernel when there are exactly 4 inputs
+            ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(inputs_vector[0], inputs_vector[1], inputs_vector[2], inputs_vector[3]);
+            ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenate4TensorsKernel::validate(inputs_vector[0], inputs_vector[1], inputs_vector[2], inputs_vector[3], &tmp_output_info));
+            break;
+        default:
+            // Validate the generic WidthConcatenate kernel for any other number of inputs
+            unsigned int width_offset = 0;
+            for(const auto &input : inputs_vector)
+            {
+                ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+                ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenateLayerKernel::validate(input, width_offset, &tmp_output_info));
+                width_offset += input->dimension(0);
+            }
+            break;
     }
 
     return Status{};
@@ -74,16 +94,30 @@
 
     // Output auto initialization if not yet initialized
     auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type());
+
     ARM_COMPUTE_ERROR_THROW_ON(CLWidthConcatenateLayer::validate(inputs_vector_info, output->info()));
 
-    unsigned int width_offset = 0;
-
-    _concat_kernels_vector = arm_compute::support::cpp14::make_unique<CLWidthConcatenateLayerKernel[]>(_num_inputs);
-
-    for(unsigned int i = 0; i < _num_inputs; i++)
+    switch(_num_inputs)
     {
-        _concat_kernels_vector[i].configure(inputs_vector.at(i), width_offset, output);
-        width_offset += inputs_vector.at(i)->info()->dimension(0);
+        case 2:
+            // Configure WidthConcatenate2Tensors kernel
+            _concat_x2_kernel.configure(inputs_vector.at(0), inputs_vector.at(1), output);
+            break;
+        case 4:
+            // Configure WidthConcatenate4Tensors kernel
+            _concat_x4_kernel.configure(inputs_vector.at(0), inputs_vector.at(1), inputs_vector.at(2), inputs_vector.at(3), output);
+            break;
+        default:
+            // Configure the generic WidthConcatenate kernels, one per input
+            _concat_kernels_vector = arm_compute::support::cpp14::make_unique<CLWidthConcatenateLayerKernel[]>(_num_inputs);
+
+            unsigned int width_offset = 0;
+            for(unsigned int i = 0; i < _num_inputs; ++i)
+            {
+                _concat_kernels_vector[i].configure(inputs_vector.at(i), width_offset, output);
+                width_offset += inputs_vector.at(i)->info()->dimension(0);
+            }
+            break;
     }
 }
 
@@ -91,8 +125,19 @@
 {
     cl::CommandQueue q = CLScheduler::get().queue();
 
-    for(unsigned i = 0; i < _num_inputs; i++)
+    switch(_num_inputs)
     {
-        CLScheduler::get().enqueue(_concat_kernels_vector[i], true);
+        case 2:
+            CLScheduler::get().enqueue(_concat_x2_kernel, true);
+            break;
+        case 4:
+            CLScheduler::get().enqueue(_concat_x4_kernel, true);
+            break;
+        default:
+            for(unsigned int i = 0; i < _num_inputs; ++i)
+            {
+                CLScheduler::get().enqueue(_concat_kernels_vector[i], true);
+            }
+            break;
     }
 }
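
For context, a minimal usage sketch (not part of the patch) driving the four-input path; the shapes, data type and CL context setup are illustrative assumptions rather than code taken from the library.

// Minimal usage sketch (not part of the patch). Four inputs are concatenated
// along the width (x) axis, so run() enqueues CLWidthConcatenate4TensorsKernel.
// Shapes, data type and the CL context setup are illustrative assumptions.
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLWidthConcatenateLayer.h"

using namespace arm_compute;

int main()
{
    // Create an OpenCL context and queue for the CL scheduler.
    CLScheduler::get().default_init();

    // Four inputs of width 16; the output width is the sum of the input widths.
    CLTensor in0, in1, in2, in3, out;
    const TensorInfo input_info(TensorShape(16U, 4U), 1, DataType::F32);
    in0.allocator()->init(input_info);
    in1.allocator()->init(input_info);
    in2.allocator()->init(input_info);
    in3.allocator()->init(input_info);
    // The output could also be left empty and auto-initialized by configure().
    out.allocator()->init(TensorInfo(TensorShape(64U, 4U), 1, DataType::F32));

    // With exactly four inputs, configure() selects the 4-tensor kernel.
    CLWidthConcatenateLayer concat;
    concat.configure({ &in0, &in1, &in2, &in3 }, &out);

    in0.allocator()->allocate();
    in1.allocator()->allocate();
    in2.allocator()->allocate();
    in3.allocator()->allocate();
    out.allocator()->allocate();

    // In a real program the inputs would be filled via map()/unmap() here.
    concat.run();
    CLScheduler::get().sync();
    return 0;
}

Because the dispatch in configure() and run() is keyed purely on _num_inputs, callers do not change anything: two- and four-input concatenations pick up the specialised kernels transparently, and every other input count still goes through the per-input CLWidthConcatenateLayerKernel loop.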