arm_compute v19.08 — changes under src/runtime/CL/functions
diff --git a/src/runtime/CL/functions/CLConcatenateLayer.cpp b/src/runtime/CL/functions/CLConcatenateLayer.cpp
index b8224d2..5d224db 100644
--- a/src/runtime/CL/functions/CLConcatenateLayer.cpp
+++ b/src/runtime/CL/functions/CLConcatenateLayer.cpp
@@ -23,11 +23,14 @@
  */
 #include "arm_compute/runtime/CL/functions/CLConcatenateLayer.h"
 
+#include "arm_compute/core/CL/kernels/CLBatchConcatenateLayerKernel.h"
+#include "arm_compute/core/CL/kernels/CLDepthConcatenateLayerKernel.h"
 #include "arm_compute/core/CL/kernels/CLHeightConcatenateLayerKernel.h"
+#include "arm_compute/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h"
+#include "arm_compute/core/CL/kernels/CLWidthConcatenate4TensorsKernel.h"
+#include "arm_compute/core/CL/kernels/CLWidthConcatenateLayerKernel.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
-#include "arm_compute/runtime/CL/functions/CLDepthConcatenateLayer.h"
-#include "arm_compute/runtime/CL/functions/CLWidthConcatenateLayer.h"
 
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Error.h"
@@ -44,14 +47,35 @@
 {
 }
 
-void CLConcatenateLayer::configure(const std::vector<ICLTensor *> &inputs_vector, ICLTensor *output, size_t axis)
+void CLConcatenateLayer::configure(std::vector<ICLTensor *> &inputs_vector, ICLTensor *output, size_t axis)
+{
+    configure_internal(std::move(inputs_vector), output, axis);
+}
+
+void CLConcatenateLayer::configure(std::vector<const ICLTensor *> &inputs_vector, ICLTensor *output, size_t axis)
+{
+    configure_internal(std::move(inputs_vector), output, axis);
+}
+
+Status CLConcatenateLayer::validate(const std::vector<ITensorInfo *> &inputs_vector, const ITensorInfo *output, size_t axis)
+{
+    return validate_internal(inputs_vector, output, axis);
+}
+
+Status CLConcatenateLayer::validate(const std::vector<const ITensorInfo *> &inputs_vector, const ITensorInfo *output, size_t axis)
+{
+    return validate_internal(inputs_vector, output, axis);
+}
+
+template <typename TensorType>
+void CLConcatenateLayer::configure_internal(std::vector<TensorType *> &&inputs_vector, ICLTensor *output, size_t axis)
 {
     ARM_COMPUTE_ERROR_ON(output == nullptr);
     _axis       = axis;
     _num_inputs = inputs_vector.size();
 
     std::vector<ITensorInfo *> inputs_vector_info(inputs_vector.size());
-    std::transform(inputs_vector.begin(), inputs_vector.end(), inputs_vector_info.begin(), [](ICLTensor * t)
+    std::transform(inputs_vector.begin(), inputs_vector.end(), inputs_vector_info.begin(), [](TensorType * t)
     {
         ARM_COMPUTE_ERROR_ON_NULLPTR(t);
         return t->info();
@@ -122,12 +146,24 @@
             }
             break;
         }
+        case 3:
+        {
+            for(unsigned int i = 0; i < _num_inputs; ++i)
+            {
+                auto kernel = support::cpp14::make_unique<CLBatchConcatenateLayerKernel>();
+                kernel->configure(inputs_vector.at(i), offset, output);
+                offset += inputs_vector.at(i)->info()->dimension(_axis);
+                _concat_kernels.emplace_back(std::move(kernel));
+            }
+            break;
+        }
         default:
             ARM_COMPUTE_ERROR("Axis not supported");
     }
 }
 
-Status CLConcatenateLayer::validate(const std::vector<ITensorInfo *> &inputs_vector, const ITensorInfo *output, size_t axis)
+template <typename TensorInfoType>
+Status CLConcatenateLayer::validate_internal(const std::vector<TensorInfoType *> &inputs_vector, const ITensorInfo *output, size_t axis)
 {
     ARM_COMPUTE_RETURN_ERROR_ON(output == nullptr);
     const unsigned int num_inputs = inputs_vector.size();
@@ -182,6 +218,15 @@
             }
             break;
         }
+        case 3:
+        {
+            for(const auto &input : inputs_vector)
+            {
+                ARM_COMPUTE_RETURN_ON_ERROR(CLBatchConcatenateLayerKernel::validate(input, offset, output));
+                offset += input->dimension(axis);
+            }
+            break;
+        }
         default:
             ARM_COMPUTE_ERROR("Axis not supported");
     }
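
The reworked function now owns the width/height/depth/batch kernels directly and accepts axis 3 for batch concatenation. Below is a minimal sketch of the new entry point, assuming an initialised CLScheduler; the tensor shapes are illustrative only.

    #include "arm_compute/runtime/CL/CLFunctions.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"

    using namespace arm_compute;

    int main()
    {
        CLScheduler::get().default_init();

        // Two F32 tensors that agree on W/H/C and differ only in batch size.
        CLTensor t0, t1, dst;
        t0.allocator()->init(TensorInfo(TensorShape(8U, 8U, 3U, 1U), 1, DataType::F32));
        t1.allocator()->init(TensorInfo(TensorShape(8U, 8U, 3U, 2U), 1, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(8U, 8U, 3U, 3U), 1, DataType::F32));

        std::vector<ICLTensor *> inputs = { &t0, &t1 };

        CLConcatenateLayer concat;
        concat.configure(inputs, &dst, 3); // axis 3 dispatches CLBatchConcatenateLayerKernel

        t0.allocator()->allocate();
        t1.allocator()->allocate();
        dst.allocator()->allocate();

        concat.run();
        CLScheduler::get().sync();
        return 0;
    }
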
diff --git a/src/runtime/CL/functions/CLConvolutionLayer.cpp b/src/runtime/CL/functions/CLConvolutionLayer.cpp
index 165d523..d794cde 100644
--- a/src/runtime/CL/functions/CLConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLConvolutionLayer.cpp
@@ -188,11 +188,17 @@
     }
     else
     {
-        if((weights->dimension(idx_h) > 7) && (input->dimension(idx_c) > output->dimension(idx_c)) && ( CLFFTConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info)))
+        // SRGAN
+        if((input->dimension(idx_h) > 720U) && (output->dimension(idx_h) > 720U) && (weights->dimension(idx_h) == 9) && (conv_info.pad_top() < 3)
+           && (CLDirectConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info)))
+        {
+            return ConvolutionMethod::DIRECT;
+        }
+        if((weights->dimension(idx_h) > 7) && (input->dimension(idx_c) > output->dimension(idx_c)) && (CLFFTConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info)))
         {
             return ConvolutionMethod::FFT;
         }
-        if (input->dimension(idx_c) < 16)
+        if(input->dimension(idx_c) < 16)
         {
             return ConvolutionMethod::GEMM;
         }
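
The new branch prefers direct convolution for very large spatial sizes with 9x9 kernels and little top padding (the SRGAN upscaling case) before falling back to the FFT and GEMM heuristics. A hedged sketch of querying the chosen method, assuming the 19.08 signature of get_convolution_method and already-initialised tensor infos:

    // src, wei, dst are ITensorInfo* describing the convolution; conv_info holds its stride/padding.
    const ConvolutionMethod method = CLConvolutionLayer::get_convolution_method(
        src, wei, dst, conv_info, WeightsInfo(), ActivationLayerInfo(),
        CLScheduler::get().target());
    // For inputs and outputs taller than 720 rows, 9-row weights and pad_top < 3,
    // the branch above now returns ConvolutionMethod::DIRECT.
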
diff --git a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
index c6f79d3..7aa7714 100644
--- a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
@@ -42,10 +42,9 @@
 }
 
 void CLDeconvolutionLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info,
-                                     unsigned int inner_border_right, unsigned int inner_border_top, const WeightsInfo &weights_info)
+                                     const WeightsInfo &weights_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
-    ARM_COMPUTE_UNUSED(inner_border_right, inner_border_top);
 
     switch(CLDeconvolutionLayer::get_deconvolution_method(input->info(), weights->info(), nullptr, output->info(), deconv_info, weights_info))
     {
@@ -70,11 +69,9 @@
 }
 
 Status CLDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &deconv_info,
-                                      unsigned int inner_border_right, unsigned int inner_border_top, const WeightsInfo &weights_info)
+                                      const WeightsInfo &weights_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
-    ARM_COMPUTE_UNUSED(inner_border_right, inner_border_top);
-
     switch(CLDeconvolutionLayer::get_deconvolution_method(input, weights, bias, output, deconv_info, weights_info))
     {
         case DeconvolutionMethod::DIRECT:
@@ -115,18 +112,6 @@
     return DeconvolutionMethod::GEMM;
 }
 
-void CLDeconvolutionLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info,
-                                     const WeightsInfo &weights_info)
-{
-    configure(input, weights, bias, output, deconv_info, 0, 0, weights_info);
-}
-
-Status CLDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &deconv_info,
-                                      const WeightsInfo &weights_info)
-{
-    return CLDeconvolutionLayer::validate(input, weights, bias, output, deconv_info, 0, 0, weights_info);
-}
-
 void CLDeconvolutionLayer::run()
 {
     prepare();
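
With the inner-border arguments removed, the shorter overloads are now the only entry points. A minimal before/after sketch, assuming already-initialised tensors:

    CLDeconvolutionLayer deconv;
    // 19.08: no inner_border_right / inner_border_top arguments.
    deconv.configure(&src, &weights, &bias, &dst, PadStrideInfo(2, 2, 1, 1));
    // Pre-19.08 equivalent: deconv.configure(&src, &weights, &bias, &dst, info, 0, 0);
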
diff --git a/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp b/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp
index c66dff0..63a45aa 100644
--- a/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp
+++ b/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp
@@ -36,20 +36,18 @@
 {
 }
 
-Status CLDeconvolutionLayerUpsample::validate(const ITensorInfo *input, const ITensorInfo *output, const BorderSize &inner_border,
-                                              const PadStrideInfo &info)
+Status CLDeconvolutionLayerUpsample::validate(const ITensorInfo *input, const ITensorInfo *output, const PadStrideInfo &info)
 {
-    return CLDeconvolutionLayerUpsampleKernel::validate(input, output, inner_border, info);
+    return CLDeconvolutionLayerUpsampleKernel::validate(input, output, info);
 }
 
-void CLDeconvolutionLayerUpsample::configure(ICLTensor *input, ICLTensor *output, const BorderSize &inner_border,
-                                             const PadStrideInfo &info)
+void CLDeconvolutionLayerUpsample::configure(ICLTensor *input, ICLTensor *output, const PadStrideInfo &info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
     _output = output;
     _memset.configure(_output, PixelValue(0, _output->info()->data_type(), _output->info()->quantization_info()));
-    _upsample.configure(input, _output, inner_border, info);
+    _upsample.configure(input, _output, info);
 }
 
 void CLDeconvolutionLayerUpsample::run()
@@ -57,4 +55,4 @@
     CLScheduler::get().enqueue(_memset, false);
     CLScheduler::get().enqueue(_upsample, true);
 }
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLDepthConcatenateLayer.cpp b/src/runtime/CL/functions/CLDepthConcatenateLayer.cpp
deleted file mode 100644
index f687e54..0000000
--- a/src/runtime/CL/functions/CLDepthConcatenateLayer.cpp
+++ /dev/null
@@ -1,107 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLDepthConcatenateLayer.h"
-
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/PixelValue.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "support/ToolchainSupport.h"
-
-using namespace arm_compute;
-
-CLDepthConcatenateLayer::CLDepthConcatenateLayer() // NOLINT
-    : _concat_kernels_vector(),
-      _border_handlers_vector(),
-      _num_inputs(0)
-{
-}
-
-void CLDepthConcatenateLayer::configure(const std::vector<ICLTensor *> &inputs_vector, ICLTensor *output) // NOLINT
-{
-    _num_inputs = inputs_vector.size();
-
-    std::vector<ITensorInfo *> inputs_vector_info;
-    for(unsigned int i = 0; i < _num_inputs; i++)
-    {
-        inputs_vector_info.emplace_back(inputs_vector.at(i)->info());
-    }
-
-    _concat_kernels_vector.resize(_num_inputs);
-    _border_handlers_vector.resize(_num_inputs);
-
-    TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector_info, Window::DimZ);
-
-    // Output auto inizialitation if not yet initialized
-    auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type());
-    ARM_COMPUTE_ERROR_THROW_ON(CLDepthConcatenateLayer::validate(inputs_vector_info, output->info()));
-
-    unsigned int depth_offset = 0;
-    for(unsigned int i = 0; i < _num_inputs; i++)
-    {
-        _concat_kernels_vector[i].configure(inputs_vector.at(i), depth_offset, output);
-        _border_handlers_vector[i].configure(inputs_vector.at(i), _concat_kernels_vector[i].border_size(), BorderMode::CONSTANT, PixelValue());
-
-        depth_offset += inputs_vector.at(i)->info()->dimension(2);
-    }
-
-    // Set valid region from shape
-    output->info()->set_valid_region(ValidRegion(Coordinates(), output_shape));
-}
-
-Status CLDepthConcatenateLayer::validate(const std::vector<ITensorInfo *> &inputs_vector, const ITensorInfo *output)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
-    ARM_COMPUTE_RETURN_ERROR_ON(inputs_vector.size() < 2);
-
-    // Output auto inizialitation if not yet initialized
-    TensorInfo  tmp_output_info = *output->clone();
-    TensorShape output_shape    = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, Window::DimZ);
-    auto_init_if_empty(tmp_output_info, output_shape, 1, inputs_vector[0]->data_type());
-
-    unsigned int depth_offset = 0;
-    for(const auto &input : inputs_vector)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
-        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthConcatenateLayerKernel::validate(input, depth_offset, &tmp_output_info));
-        depth_offset += input->dimension(2);
-    }
-
-    return Status{};
-}
-
-void CLDepthConcatenateLayer::run()
-{
-    cl::CommandQueue q = CLScheduler::get().queue();
-
-    for(unsigned i = 0; i < _num_inputs; i++)
-    {
-        CLScheduler::get().enqueue(_border_handlers_vector[i], false);
-        CLScheduler::get().enqueue(_concat_kernels_vector[i], true);
-    }
-}
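
The standalone depth-concatenation function is gone; the same behaviour is reached through CLConcatenateLayer with the channel axis. A hedged replacement sketch, assuming feat0/feat1/feat2 and dst follow the old contract (matching width, height and batch, with dst channels equal to the sum of the input channels):

    std::vector<ICLTensor *> inputs = { &feat0, &feat1, &feat2 };
    CLConcatenateLayer concat;
    concat.configure(inputs, &dst, 2); // axis 2 == depth (Window::DimZ)
    concat.run();
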
diff --git a/src/runtime/CL/functions/CLDepthToSpaceLayer.cpp b/src/runtime/CL/functions/CLDepthToSpaceLayer.cpp
new file mode 100644
index 0000000..08aef92
--- /dev/null
+++ b/src/runtime/CL/functions/CLDepthToSpaceLayer.cpp
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/CL/functions/CLDepthToSpaceLayer.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+CLDepthToSpaceLayer::CLDepthToSpaceLayer()
+    : _depth_to_space_kernel()
+{
+}
+
+void CLDepthToSpaceLayer::configure(const ICLTensor *input, ICLTensor *output, int32_t block_shape)
+{
+    _depth_to_space_kernel.configure(input, output, block_shape);
+}
+
+Status CLDepthToSpaceLayer::validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape)
+{
+    return CLDepthToSpaceLayerKernel::validate(input, output, block_shape);
+}
+
+void CLDepthToSpaceLayer::run()
+{
+    CLScheduler::get().enqueue(_depth_to_space_kernel, true);
+}
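
A minimal usage sketch for the new function, assuming an initialised CLScheduler and NCHW F32 tensors; block_shape 2 moves each group of 4 channels into a 2x2 spatial block.

    CLTensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(4U, 4U, 16U), 1, DataType::F32)); // W=4, H=4, C=16
    dst.allocator()->init(TensorInfo(TensorShape(8U, 8U, 4U), 1, DataType::F32));  // W=8, H=8, C=4

    CLDepthToSpaceLayer d2s;
    d2s.configure(&src, &dst, 2); // block_shape = 2

    src.allocator()->allocate();
    dst.allocator()->allocate();
    d2s.run();
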
diff --git a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
index 97b0a01..f01b58a 100644
--- a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
@@ -90,6 +90,7 @@
         // Configure the function to transform the weights tensor from HWI -> IHW
         _permute_weights_to_nchw.configure(weights, &_permuted_weights, PermutationVector(1U, 2U, 0U));
         _permuted_weights.info()->set_data_layout(DataLayout::NCHW);
+        _permuted_output.info()->set_quantization_info(output->info()->quantization_info());
 
         input_to_use   = &_permuted_input;
         weights_to_use = &_permuted_weights;
@@ -130,7 +131,7 @@
     PixelValue &&zero_value(0.f);
     if(is_data_type_quantized_asymmetric(input->info()->data_type()))
     {
-        zero_value = PixelValue(static_cast<uint8_t>(input->info()->quantization_info().offset));
+        zero_value = PixelValue(static_cast<uint8_t>(input->info()->quantization_info().uniform().offset));
     }
     _border_handler.configure(input_to_use, _kernel->border_size(), BorderMode::CONSTANT, zero_value);
 }
@@ -141,9 +142,10 @@
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
     ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
 
+    const bool                      is_quantized           = is_data_type_quantized_asymmetric(input->data_type());
     const bool                      is_nhwc                = input->data_layout() == DataLayout::NHWC;
     const bool                      needs_permute          = is_nhwc && (depth_multiplier > 1);
-    const bool                      needs_weights_reshape  = is_nhwc && (depth_multiplier == 1);
+    const bool                      needs_weights_reshape  = is_nhwc && (depth_multiplier == 1) && is_quantized;
     const bool                      is_stride_1            = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1));
     const bool                      is_stride_1_dilation_1 = (is_stride_1 && dilation.x() == 1 && dilation.y() == 1);
     const bool                      is_dot8_supported      = dot8_supported(CLKernelLibrary::get().get_device());
@@ -151,6 +153,17 @@
     info.c0        = 4;
     info.transpose = is_stride_1_dilation_1 && is_dot8_supported;
 
+    if(is_quantized)
+    {
+        const UniformQuantizationInfo iq_info = input->quantization_info().uniform();
+        const UniformQuantizationInfo wq_info = weights->quantization_info().uniform();
+        const UniformQuantizationInfo oq_info = (output->total_size() == 0) ? iq_info : output->quantization_info().uniform();
+
+        const float multiplier = iq_info.scale * wq_info.scale / oq_info.scale;
+        ARM_COMPUTE_UNUSED(multiplier);
+        ARM_COMPUTE_RETURN_ERROR_ON(multiplier > 1.0f);
+    }
+
     if(needs_permute)
     {
         TensorShape permuted_input_shape   = input->tensor_shape();
@@ -176,7 +189,10 @@
             ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NHWCKernel::validate(input, &weights->clone()->set_tensor_shape(reshaped_weights_shape), biases, output, conv_info, depth_multiplier,
                                                                                            act_info, dilation));
         }
-        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NHWCKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation));
+        else
+        {
+            ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NHWCKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation));
+        }
     }
     else
     {
@@ -288,6 +304,10 @@
         const size_t patch_size = weights_w * weights_h + ((append_bias) ? 1 : 0);
         const size_t conv_size  = conv_w * conv_h;
 
+        const UniformQuantizationInfo iq_info = input->info()->quantization_info().uniform();
+        const UniformQuantizationInfo wq_info = weights->info()->quantization_info().uniform();
+        const UniformQuantizationInfo oq_info = output->info()->quantization_info().uniform();
+
         // Im2Col configuration
         TensorShape shape_im2col = input->info()->tensor_shape();
         shape_im2col.set(0, patch_size);
@@ -319,11 +339,11 @@
         // Output staged configuration
         if(_is_quantized)
         {
-            const QuantizationInfo output_quant_info = (output->info()->total_size() == 0) ? input->info()->quantization_info() : output->info()->quantization_info();
+            const UniformQuantizationInfo output_quant_info = (output->info()->total_size() == 0) ? iq_info : oq_info;
 
-            float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output_quant_info.scale;
-            int   output_multiplier;
-            int   output_shift;
+            int         output_multiplier = 0;
+            int         output_shift      = 0;
+            const float multiplier        = iq_info.scale * wq_info.scale / output_quant_info.scale;
             quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
             _output_stage_kernel.configure(&_output_reshaped, biases, output, output_multiplier, output_shift, output_quant_info.offset);
             _output_reshaped.allocator()->allocate();
@@ -334,8 +354,8 @@
         PixelValue zero_w(static_cast<int32_t>(0));
         if(_is_quantized)
         {
-            zero_in = PixelValue(static_cast<int32_t>(input->info()->quantization_info().offset));
-            zero_w  = PixelValue(static_cast<int32_t>(weights->info()->quantization_info().offset));
+            zero_in = PixelValue(static_cast<int32_t>(iq_info.offset));
+            zero_w  = PixelValue(static_cast<int32_t>(wq_info.offset));
         }
         BorderSize border_size = _v2mm_kernel.border_size();
         _v2mm_input_fill_border.configure(&_input_reshaped, border_size, BorderMode::CONSTANT, zero_in);
@@ -368,7 +388,7 @@
 
     const bool can_run_optimised_3x3_kernel = (weights->dimension(idx_w) == 3) && (weights->dimension(idx_h) == 3);
 
-    if(can_run_optimised_3x3_kernel)
+    if(!can_run_optimised_3x3_kernel)
     {
         const size_t idx_c = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
 
@@ -410,6 +430,13 @@
 
         if(is_quantized)
         {
+            const UniformQuantizationInfo iq_info = input->quantization_info().uniform();
+            const UniformQuantizationInfo wq_info = weights->quantization_info().uniform();
+            const UniformQuantizationInfo oq_info = (output->total_size() == 0) ? iq_info : output->quantization_info().uniform();
+
+            const float multiplier = iq_info.scale * wq_info.scale / oq_info.scale;
+            ARM_COMPUTE_UNUSED(multiplier);
+            ARM_COMPUTE_RETURN_ERROR_ON(multiplier > 1.0f);
             ARM_COMPUTE_RETURN_ON_ERROR(CLDirectConvolutionLayerOutputStageKernel::validate(&output_reshaped, biases, output));
         }
 
@@ -421,7 +448,7 @@
     }
     else
     {
-        CLDepthwiseConvolutionLayer3x3::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, GPUTarget::MIDGARD, dilation);
+        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, GPUTarget::MIDGARD, dilation));
     }
     return Status{};
 }
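
The new guard rejects quantized configurations whose effective requantisation multiplier exceeds 1.0. A small worked fragment with made-up scales and offsets illustrating the check:

    const UniformQuantizationInfo iq(0.5f, 10);  // input scale/offset (illustrative values)
    const UniformQuantizationInfo wq(0.25f, 0);  // weights
    const UniformQuantizationInfo oq(0.2f, 5);   // output
    const float multiplier = iq.scale * wq.scale / oq.scale; // 0.625f -> accepted
    // validate() now returns an error status if this multiplier is greater than 1.0f.
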
diff --git a/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp b/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp
index c451bd4..bfc6ff1 100644
--- a/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -49,7 +49,7 @@
     PixelValue &&zero_value(0.f);
     if(is_data_type_quantized_asymmetric(input->info()->data_type()))
     {
-        zero_value = PixelValue(static_cast<uint8_t>(input->info()->quantization_info().offset));
+        zero_value = PixelValue(static_cast<uint8_t>(input->info()->quantization_info().uniform().offset));
     }
     _input_border_handler.configure(input, _direct_conv_kernel.border_size(), BorderMode::CONSTANT, zero_value);
 
diff --git a/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp
index 6e14e26..c1a39ef 100644
--- a/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp
@@ -55,7 +55,6 @@
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights);
-
     const DataLayout data_layout = input->data_layout();
 
     const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
@@ -95,11 +94,11 @@
 
     unsigned int        padx            = 0;
     unsigned int        pady            = 0;
-    const TensorShape   scale_out_shape = compute_deconvolution_upsampled_shape(*input, *weights, stride_x, stride_y, 0, 0, out_dims, padx, pady);
+    const TensorShape   scale_out_shape = compute_deconvolution_upsampled_shape(*input, *weights, stride_x, stride_y, out_dims, padx, pady);
     TensorInfo          scale_out_info(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape).set_data_layout(data_layout));
     const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
 
-    ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionLayerUpsample::validate(input, &scale_out_info, BorderSize(), info));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionLayerUpsample::validate(input, &scale_out_info, info));
     ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayer::validate(&scale_out_info, weights, bias, output, conv_info, weights_info));
 
     return Status{};
@@ -141,7 +140,7 @@
     // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in order to match output shape
     unsigned int      padx            = 0;
     unsigned int      pady            = 0;
-    const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input->info(), *weights->info(), stride_x, stride_y, 0, 0, out_dims, padx, pady);
+    const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input->info(), *weights->info(), stride_x, stride_y, out_dims, padx, pady);
 
     TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), input->info()->quantization_info());
     scale_out_info.set_data_layout(data_layout);
@@ -149,7 +148,7 @@
 
     // configure scale function
     const PadStrideInfo upsample_info(stride_x, stride_y, padx / 2, pady / 2);
-    _scale_f.configure(input, &_scaled_output, BorderSize(), upsample_info);
+    _scale_f.configure(input, &_scaled_output, upsample_info);
 
     // Setup the function to convolve the upscaled output
     const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
diff --git a/src/runtime/CL/functions/CLElementWiseUnaryLayer.cpp b/src/runtime/CL/functions/CLElementWiseUnaryLayer.cpp
index b7e9a68..b9ebf69 100644
--- a/src/runtime/CL/functions/CLElementWiseUnaryLayer.cpp
+++ b/src/runtime/CL/functions/CLElementWiseUnaryLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -51,4 +51,59 @@
 {
     return CLElementWiseUnaryLayerKernel::validate(input, output, ElementWiseUnary::EXP);
 }
+
+void CLNegLayer::configure(const ICLTensor *input, ICLTensor *output)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLElementWiseUnaryLayerKernel>();
+    k->configure(input, output, ElementWiseUnary::NEG);
+    _kernel = std::move(k);
+}
+Status CLNegLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+    return CLElementWiseUnaryLayerKernel::validate(input, output, ElementWiseUnary::NEG);
+}
+
+void CLSinLayer::configure(const ICLTensor *input, ICLTensor *output)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLElementWiseUnaryLayerKernel>();
+    k->configure(input, output, ElementWiseUnary::SIN);
+    _kernel = std::move(k);
+}
+Status CLSinLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+    return CLElementWiseUnaryLayerKernel::validate(input, output, ElementWiseUnary::SIN);
+}
+
+void CLAbsLayer::configure(const ICLTensor *input, ICLTensor *output)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLElementWiseUnaryLayerKernel>();
+    k->configure(input, output, ElementWiseUnary::ABS);
+    _kernel = std::move(k);
+}
+Status CLAbsLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+    return CLElementWiseUnaryLayerKernel::validate(input, output, ElementWiseUnary::ABS);
+}
+void CLLogLayer::configure(const ICLTensor *input, ICLTensor *output)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLElementWiseUnaryLayerKernel>();
+    k->configure(input, output, ElementWiseUnary::LOG);
+    _kernel = std::move(k);
+}
+Status CLLogLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+    return CLElementWiseUnaryLayerKernel::validate(input, output, ElementWiseUnary::LOG);
+}
+
+void CLRoundLayer::configure(const ICLTensor *input, ICLTensor *output)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLElementWiseUnaryLayerKernel>();
+    k->configure(input, output, ElementWiseUnary::ROUND);
+    _kernel = std::move(k);
+}
+Status CLRoundLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+    return CLElementWiseUnaryLayerKernel::validate(input, output, ElementWiseUnary::ROUND);
+}
+
 } // namespace arm_compute
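
The newly exposed unary functions (NEG, SIN, ABS, LOG, ROUND) all wrap the same kernel; only the ElementWiseUnary operation differs. A minimal sketch with NEG, assuming an initialised CLScheduler:

    CLTensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::F32));

    CLNegLayer neg;
    neg.configure(&src, &dst); // CLSinLayer, CLAbsLayer, CLLogLayer, CLRoundLayer take the same two arguments

    src.allocator()->allocate();
    dst.allocator()->allocate();
    neg.run();
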
diff --git a/src/runtime/CL/functions/CLElementwiseOperations.cpp b/src/runtime/CL/functions/CLElementwiseOperations.cpp
index 28f4b13..8317e0d 100644
--- a/src/runtime/CL/functions/CLElementwiseOperations.cpp
+++ b/src/runtime/CL/functions/CLElementwiseOperations.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,10 +21,11 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
+#include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h"
+
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h"
 #include "support/ToolchainSupport.h"
-#include <arm_compute/runtime/CL/functions/CLElementwiseOperations.h>
 
 #include <utility>
 
@@ -124,4 +125,18 @@
 {
     return CLArithmeticOperationKernel::validate(ArithmeticOperation::SQUARED_DIFF, input1, input2, output);
 }
+
+void CLElementwisePower::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLArithmeticOperationKernel>();
+    k->configure(ArithmeticOperation::POWER, input1, input2, output);
+    _kernel = std::move(k);
+    configure_border_handler(_border_handler, _kernel->border_size(), input1, input2, output);
+}
+
+Status CLElementwisePower::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+    return CLArithmeticOperationKernel::validate(ArithmeticOperation::POWER, input1, input2, output);
+}
+
 } // namespace arm_compute
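
CLElementwisePower computes out[i] = x[i] ^ y[i] with the usual broadcasting rules of the arithmetic kernels. A hedged fragment, assuming x, y and out are already-initialised F32 CLTensors:

    ARM_COMPUTE_ERROR_THROW_ON(CLElementwisePower::validate(x.info(), y.info(), out.info()));

    CLElementwisePower power;
    power.configure(&x, &y, &out);
    power.run();
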
diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
index 7b9229c..c5da649 100644
--- a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
@@ -41,10 +41,13 @@
 {
     if(is_data_type_quantized_asymmetric(input.data_type()))
     {
+        const UniformQuantizationInfo iq_info = input.quantization_info().uniform();
+        const UniformQuantizationInfo wq_info = weights.quantization_info().uniform();
+
         // Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
         // Extract and negate input and weights offset
-        const QuantizationInfo input_quantization_info(input.quantization_info().scale, -input.quantization_info().offset);
-        const QuantizationInfo weights_quantization_info(weights.quantization_info().scale, -weights.quantization_info().offset);
+        const QuantizationInfo input_quantization_info(iq_info.scale, -iq_info.offset);
+        const QuantizationInfo weights_quantization_info(wq_info.scale, -wq_info.offset);
 
         // Validate gemmlowp function
         ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate(&input.clone()->set_quantization_info(input_quantization_info),
@@ -88,8 +91,8 @@
         const QuantizationInfo input_quantization_info   = input->info()->quantization_info();
         const QuantizationInfo weights_quantization_info = weights->info()->quantization_info();
 
-        input->info()->set_quantization_info(QuantizationInfo(input_quantization_info.scale, -input_quantization_info.offset));
-        weights->info()->set_quantization_info(QuantizationInfo(weights_quantization_info.scale, -weights_quantization_info.offset));
+        input->info()->set_quantization_info(QuantizationInfo(input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset));
+        weights->info()->set_quantization_info(QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset));
 
         // Configure gemmlowp function
         _mm_gemmlowp.configure(input, weights, nullptr, output);
@@ -230,11 +233,15 @@
     // Configure output stage for asymmetric quantized types
     if(_is_quantized)
     {
-        float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output->info()->quantization_info().scale;
+        const UniformQuantizationInfo iq_info = input->info()->quantization_info().uniform();
+        const UniformQuantizationInfo wq_info = weights->info()->quantization_info().uniform();
+        const UniformQuantizationInfo oq_info = output->info()->quantization_info().uniform();
+
+        float multiplier = iq_info.scale * wq_info.scale / oq_info.scale;
         int   output_multiplier;
         int   output_shift;
         quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
-        _gemmlowp_output_stage.configure(&_gemmlowp_output, biases, output, output_multiplier, output_shift, output->info()->quantization_info().offset);
+        _gemmlowp_output_stage.configure(&_gemmlowp_output, biases, output, output_multiplier, output_shift, oq_info.offset);
         _gemmlowp_output.allocator()->allocate();
     }
 }
@@ -324,6 +331,13 @@
     // Validate output stage for asymmetric quantized types
     if(is_quantized)
     {
+        const UniformQuantizationInfo iq_info    = input->quantization_info().uniform();
+        const UniformQuantizationInfo wq_info    = weights->quantization_info().uniform();
+        const UniformQuantizationInfo oq_info    = output->quantization_info().uniform();
+        const float                   multiplier = iq_info.scale * wq_info.scale / oq_info.scale;
+
+        ARM_COMPUTE_UNUSED(multiplier);
+        ARM_COMPUTE_RETURN_ERROR_ON(multiplier > 1.0f);
         ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(&gemmlowp_output, biases, output));
     }
 
diff --git a/src/runtime/CL/functions/CLFuseBatchNormalization.cpp b/src/runtime/CL/functions/CLFuseBatchNormalization.cpp
index 32e4678..72dd27e 100644
--- a/src/runtime/CL/functions/CLFuseBatchNormalization.cpp
+++ b/src/runtime/CL/functions/CLFuseBatchNormalization.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -36,20 +36,20 @@
 {
 }
 
-void CLFuseBatchNormalization::configure(const ICLTensor *conv_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var,
+void CLFuseBatchNormalization::configure(const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var,
                                          ICLTensor *fused_weights, ICLTensor *fused_bias,
-                                         const ICLTensor *conv_bias, const ICLTensor *bn_beta, const ICLTensor *bn_gamma,
-                                         float epsilon)
+                                         const ICLTensor *input_bias, const ICLTensor *bn_beta, const ICLTensor *bn_gamma,
+                                         float epsilon, FuseBatchNormalizationType fbn_type)
 {
-    _fuse_bn_kernel.configure(conv_weights, bn_mean, bn_var, fused_weights, fused_bias, conv_bias, bn_beta, bn_gamma, epsilon);
+    _fuse_bn_kernel.configure(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type);
 }
 
-Status CLFuseBatchNormalization::validate(const ITensorInfo *conv_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var,
+Status CLFuseBatchNormalization::validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var,
                                           const ITensorInfo *fused_weights, const ITensorInfo *fused_bias,
-                                          const ITensorInfo *conv_bias, const ITensorInfo *bn_beta, const ITensorInfo *bn_gamma,
-                                          float epsilon)
+                                          const ITensorInfo *input_bias, const ITensorInfo *bn_beta, const ITensorInfo *bn_gamma,
+                                          float epsilon, FuseBatchNormalizationType fbn_type)
 {
-    return CLFuseBatchNormalizationKernel::validate(conv_weights, bn_mean, bn_var, fused_weights, fused_bias, conv_bias, bn_beta, bn_gamma, epsilon);
+    return CLFuseBatchNormalizationKernel::validate(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type);
 }
 
 void CLFuseBatchNormalization::run()
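
The extra FuseBatchNormalizationType argument selects whether the batch-normalization statistics are folded into convolution or depthwise-convolution weights. A hedged sketch, assuming the tensors are already set up and that the enumerator is spelled FuseBatchNormalizationType::CONVOLUTION:

    CLFuseBatchNormalization fuse_bn;
    fuse_bn.configure(&conv_w, &bn_mean, &bn_var, &fused_w, &fused_b,
                      &conv_b, &bn_beta, &bn_gamma, 1e-5f,
                      FuseBatchNormalizationType::CONVOLUTION);
    fuse_bn.run(); // fused_w / fused_b can then replace the original weights and bias
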
diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp
index 492709f..e78395f 100644
--- a/src/runtime/CL/functions/CLGEMM.cpp
+++ b/src/runtime/CL/functions/CLGEMM.cpp
@@ -30,10 +30,12 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/GPUTarget.h"
 #include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/KernelDescriptors.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/helpers/float_ops.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 #include "arm_compute/runtime/ITensorAllocator.h"
@@ -46,7 +48,6 @@
 CLGEMM::CLGEMM(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)),
       _mm_kernel(),
-      _ma_kernel(),
       _reshape_lhs_kernel(),
       _reshape_rhs_kernel(),
       _mm_reshaped_kernel(),
@@ -54,7 +55,6 @@
       _tmp_a(),
       _tmp_b(),
       _original_b(nullptr),
-      _run_addition(false),
       _reshape_b_only_on_first_run(false),
       _is_prepared(false),
       _gemm_type(GEMMType::NATIVE)
@@ -116,10 +116,10 @@
     // Set the target for the kernels
     _mm_kernel.set_target(gpu_target);
 
-    GEMMReshapeInfo reshape_info(m, n, k, 1, 1, gemm_info.depth_output_gemm3d(), gemm_info.reinterpret_input_as_3d());
+    GEMMReshapeInfo reshape_info(m, n, k, 1, 1, gemm_info.depth_output_gemm3d(), gemm_info.reinterpret_input_as_3d(), gemm_info.broadcast_bias());
 
     // Configure and tune matrix multiply kernel
-    _mm_kernel.configure(a, b, c, output, alpha, beta, false, reshape_info, gemm_info.fp_mixed_precision());
+    _mm_kernel.configure(a, b, c, output, alpha, beta, false, reshape_info, gemm_info.fp_mixed_precision(), gemm_info.activation_info());
 
     // Tune kernel statically
     CLScheduler::get().tune_kernel_static(_mm_kernel);
@@ -160,7 +160,7 @@
     lhs_info.interleave = true;
     lhs_info.transpose  = true;
 
-    GEMMReshapeInfo reshape_info(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d, false);
+    GEMMReshapeInfo reshape_info(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d, false, gemm_info.broadcast_bias());
 
     _memory_group.manage(&_tmp_a);
     if(!_reshape_b_only_on_first_run)
@@ -175,7 +175,7 @@
     _reshape_rhs_kernel.configure(b, &_tmp_b, rhs_info);
 
     // Configure and tune matrix multiply kernel
-    _mm_kernel.configure(&_tmp_a, &_tmp_b, c, output, alpha, beta, true, reshape_info, gemm_info.fp_mixed_precision());
+    _mm_kernel.configure(&_tmp_a, &_tmp_b, c, output, alpha, beta, true, reshape_info, gemm_info.fp_mixed_precision(), gemm_info.activation_info());
 
     CLScheduler::get().tune_kernel_static(_mm_kernel);
 
@@ -189,10 +189,6 @@
 
 void CLGEMM::configure_reshaped_v2(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info)
 {
-    ARM_COMPUTE_ERROR_ON(c != nullptr);
-    ARM_COMPUTE_UNUSED(beta);
-    ARM_COMPUTE_UNUSED(c);
-
     DataType           data_type               = a->info()->data_type();
     bool               reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
     const unsigned int m                       = reinterpret_input_as_3d ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1);
@@ -201,13 +197,21 @@
     const unsigned int batch_size              = reinterpret_input_as_3d ? a->info()->dimension(3) : a->info()->dimension(2);
     const int          depth_output_gemm3d     = gemm_info.depth_output_gemm3d();
     const GPUTarget    gpu_target              = CLScheduler::get().target();
+    bool               broadcast_bias          = gemm_info.broadcast_bias();
+
+    GEMMKernelInfo kernel_info;
+    kernel_info.m                       = m;
+    kernel_info.n                       = n;
+    kernel_info.k                       = k;
+    kernel_info.depth_output_gemm3d     = depth_output_gemm3d;
+    kernel_info.reinterpret_input_as_3d = false;
+    kernel_info.broadcast_bias          = broadcast_bias;
+    kernel_info.activation_info         = gemm_info.activation_info();
 
     // Set the target for the kernels
     _reshape_lhs_kernel.set_target(gpu_target);
     _mm_kernel.set_target(gpu_target);
 
-    GEMMReshapeInfo reshape_info(m, n, k, 1, 1, depth_output_gemm3d, false);
-
     // Manage intermediate buffers
     _memory_group.manage(&_tmp_a);
     if(!_reshape_b_only_on_first_run)
@@ -230,7 +234,7 @@
     _reshape_rhs_kernel.configure(b, &_tmp_b, rhs_info);
 
     // Configure and tune matrix multiply kernel
-    _mm_reshaped_kernel.configure(&_tmp_a, &_tmp_b, output, alpha, lhs_info, rhs_info, reshape_info);
+    _mm_reshaped_kernel.configure(&_tmp_a, &_tmp_b, c, output, alpha, beta, lhs_info, rhs_info, kernel_info);
 
     // Allocate intermediate tensors
     _tmp_a.allocator()->allocate();
@@ -242,10 +246,6 @@
 
 void CLGEMM::configure_reshaped_only_rhs(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info)
 {
-    ARM_COMPUTE_ERROR_ON(c != nullptr);
-    ARM_COMPUTE_UNUSED(beta);
-    ARM_COMPUTE_UNUSED(c);
-
     DataType           data_type               = a->info()->data_type();
     bool               reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
     const unsigned int m                       = reinterpret_input_as_3d ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1);
@@ -254,12 +254,20 @@
     const unsigned int batch_size              = reinterpret_input_as_3d ? a->info()->dimension(3) : a->info()->dimension(2);
     const int          depth_output_gemm3d     = gemm_info.depth_output_gemm3d();
     const GPUTarget    gpu_target              = CLScheduler::get().target();
+    bool               broadcast_bias          = gemm_info.broadcast_bias();
+
+    GEMMKernelInfo kernel_info;
+    kernel_info.m                       = m;
+    kernel_info.n                       = n;
+    kernel_info.k                       = k;
+    kernel_info.depth_output_gemm3d     = depth_output_gemm3d;
+    kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
+    kernel_info.broadcast_bias          = broadcast_bias;
+    kernel_info.activation_info         = gemm_info.activation_info();
 
     // Set the target for the kernels
     _mm_kernel.set_target(gpu_target);
 
-    GEMMReshapeInfo reshape_info(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d);
-
     // Manage intermediate buffers
     if(!_reshape_b_only_on_first_run)
     {
@@ -279,7 +287,7 @@
     _reshape_rhs_kernel.configure(b, &_tmp_b, rhs_info);
 
     // Configure and tune matrix multiply kernel
-    _mm_reshaped_only_rhs_kernel.configure(a, &_tmp_b, output, alpha, lhs_info, rhs_info, reshape_info);
+    _mm_reshaped_only_rhs_kernel.configure(a, &_tmp_b, c, output, alpha, beta, lhs_info, rhs_info, kernel_info);
 
     if(!_reshape_b_only_on_first_run)
     {
@@ -299,21 +307,12 @@
     const unsigned int n                       = b->dimension(0);
     const unsigned int k                       = a->dimension(0);
     const int          depth_output_gemm3d     = gemm_info.depth_output_gemm3d();
-    const bool         add_c                   = (beta != 0.f && c != nullptr);
-    const bool         is_beta_one             = std::abs(1.0f - beta) < 0.00001f;
-    const bool         fuse_add                = is_beta_one && (c != nullptr && c->num_dimensions() == 1);
 
-    const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d);
+    const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d, gemm_info.broadcast_bias());
 
     // Validate matrix multiply
-    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(a, b, (add_c && fuse_add) ? c : nullptr, output, alpha, beta,
-                                                                     false, reshape_info, gpu_target, gemm_info.fp_mixed_precision()));
-
-    if(add_c && !fuse_add)
-    {
-        // Validate matrix addition kernel
-        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixAdditionKernel::validate(c, output, beta));
-    }
+    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(a, b, c, output, alpha, beta,
+                                                                     false, reshape_info, gpu_target, gemm_info.fp_mixed_precision(), gemm_info.activation_info()));
 
     return Status{};
 }
@@ -334,9 +333,6 @@
     int                mult_transpose1xW_width   = 1;
     int                mult_interleave4x4_height = 1;
     const int          depth_output_gemm3d       = gemm_info.depth_output_gemm3d();
-    const bool         add_c                     = (beta != 0.f && c != nullptr);
-    const bool         is_beta_one               = std::abs(1.0f - beta) < 0.00001f;
-    const bool         fuse_add                  = is_beta_one && (c != nullptr && c->num_dimensions() == 1);
 
     if(get_arch_from_target(gpu_target) == GPUTarget::BIFROST)
     {
@@ -358,7 +354,7 @@
     lhs_info.interleave = true;
     lhs_info.transpose  = true;
 
-    const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d, false);
+    const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d, false, gemm_info.broadcast_bias());
 
     // Validate interleave kernel
     auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, gemm_info.reinterpret_input_as_3d())));
@@ -369,14 +365,8 @@
     ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(b, &tmp_b_info, rhs_info));
 
     // Validate matrix multiply
-    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(&tmp_a_info, &tmp_b_info, (add_c && fuse_add) ? c : nullptr, output, alpha, beta,
-                                                                     true, reshape_info, gpu_target, gemm_info.fp_mixed_precision()));
-
-    if(add_c && !fuse_add)
-    {
-        // Validate matrix addition kernel
-        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixAdditionKernel::validate(c, output, beta));
-    }
+    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(&tmp_a_info, &tmp_b_info, c, output, alpha, beta,
+                                                                     true, reshape_info, gpu_target, gemm_info.fp_mixed_precision(), gemm_info.activation_info()));
 
     return Status{};
 }
@@ -398,9 +388,16 @@
     const unsigned int k                       = a->dimension(0);
     const unsigned int batch_size              = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
     const int          depth_output_gemm3d     = gemm_info.depth_output_gemm3d();
-    const bool         add_c                   = (beta != 0.f && c != nullptr);
+    const bool         broadcast_bias          = gemm_info.broadcast_bias();
 
-    const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, false);
+    GEMMKernelInfo kernel_info;
+    kernel_info.m                       = m;
+    kernel_info.n                       = n;
+    kernel_info.k                       = k;
+    kernel_info.depth_output_gemm3d     = depth_output_gemm3d;
+    kernel_info.reinterpret_input_as_3d = false;
+    kernel_info.broadcast_bias          = broadcast_bias;
+    kernel_info.activation_info         = gemm_info.activation_info();
 
     GEMMLHSMatrixInfo lhs_info;
     GEMMRHSMatrixInfo rhs_info;
@@ -419,13 +416,7 @@
     ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(b, &tmp_b_info, rhs_info));
 
     // Validate matrix multiply
-    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyReshapedKernel::validate(&tmp_a_info, &tmp_b_info, output, alpha, lhs_info, rhs_info, reshape_info));
-
-    if(add_c)
-    {
-        // Validate matrix addition kernel
-        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixAdditionKernel::validate(c, output, beta));
-    }
+    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyReshapedKernel::validate(&tmp_a_info, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info));
 
     return Status{};
 }
@@ -446,9 +437,16 @@
     const unsigned int k                       = a->dimension(0);
     const unsigned int batch_size              = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
     const int          depth_output_gemm3d     = gemm_info.depth_output_gemm3d();
-    const bool         add_c                   = (beta != 0.f && c != nullptr);
+    const bool         broadcast_bias          = gemm_info.broadcast_bias();
 
-    const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d);
+    GEMMKernelInfo kernel_info;
+    kernel_info.m                       = m;
+    kernel_info.n                       = n;
+    kernel_info.k                       = k;
+    kernel_info.depth_output_gemm3d     = depth_output_gemm3d;
+    kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
+    kernel_info.broadcast_bias          = broadcast_bias;
+    kernel_info.activation_info         = gemm_info.activation_info();
 
     GEMMLHSMatrixInfo lhs_info;
     GEMMRHSMatrixInfo rhs_info;
@@ -464,13 +462,7 @@
     ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(b, &tmp_b_info, rhs_info));
 
     // Validate matrix multiply
-    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::validate(a, &tmp_b_info, output, alpha, lhs_info, rhs_info, reshape_info));
-
-    if(add_c)
-    {
-        // Validate matrix addition kernel
-        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixAdditionKernel::validate(c, output, beta));
-    }
+    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::validate(a, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info));
 
     return Status{};
 }
@@ -497,31 +489,30 @@
     // Select GEMMType
     _gemm_type = select_gemm_type(m, n, k, a->info()->data_type(), _reshape_b_only_on_first_run, gpu_target);
 
-    const bool is_gemm_v2  = (_gemm_type == GEMMType::RESHAPED_V2) || (_gemm_type == GEMMType::RESHAPED_ONLY_RHS);
-    const bool add_c       = (beta != 0.f && c != nullptr);
-    const bool is_beta_one = std::abs(1.0f - beta) < 0.00001f;
-    const bool fuse_add    = is_beta_one && (c != nullptr && c->info()->num_dimensions() == 1) && !is_gemm_v2;
+    const bool fuse_add_c = (!(helpers::float_ops::is_zero(beta)) && c != nullptr);
+
+    const ICLTensor *c_to_use = fuse_add_c ? c : nullptr;
 
     switch(_gemm_type)
     {
         case GEMMType::NATIVE:
         {
-            configure_native(a, b, (add_c && fuse_add) ? c : nullptr, output, alpha, beta, gemm_info);
+            configure_native(a, b, c_to_use, output, alpha, beta, gemm_info);
             break;
         }
         case GEMMType::RESHAPED_V1:
         {
-            configure_reshaped_v1(a, b, (add_c && fuse_add) ? c : nullptr, output, alpha, beta, gemm_info);
+            configure_reshaped_v1(a, b, c_to_use, output, alpha, beta, gemm_info);
             break;
         }
         case GEMMType::RESHAPED_V2:
         {
-            configure_reshaped_v2(a, b, (add_c && fuse_add) ? c : nullptr, output, alpha, beta, gemm_info);
+            configure_reshaped_v2(a, b, c_to_use, output, alpha, beta, gemm_info);
             break;
         }
         case GEMMType::RESHAPED_ONLY_RHS:
         {
-            configure_reshaped_only_rhs(a, b, (add_c && fuse_add) ? c : nullptr, output, alpha, beta, gemm_info);
+            configure_reshaped_only_rhs(a, b, c_to_use, output, alpha, beta, gemm_info);
             break;
         }
         default:
@@ -529,13 +520,6 @@
             ARM_COMPUTE_ERROR("GEMMType not supported");
         }
     }
-
-    // Configure matrix addition kernel
-    if(add_c && !fuse_add)
-    {
-        _ma_kernel.configure(c, output, beta);
-        _run_addition = true;
-    }
 }
 
 Status CLGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
@@ -550,26 +534,30 @@
     // Select GEMMType
     GEMMType gemm_type = select_gemm_type(m, n, k, a->data_type(), gemm_info.reshape_b_only_on_first_run(), gpu_target);
 
+    const bool fuse_add_c = (!(helpers::float_ops::is_zero(beta)) && c != nullptr);
+
+    const ITensorInfo *c_to_use = fuse_add_c ? c : nullptr;
+
     switch(gemm_type)
     {
         case GEMMType::NATIVE:
         {
-            ARM_COMPUTE_RETURN_ON_ERROR(validate_native(a, b, c, output, alpha, beta, gemm_info));
+            ARM_COMPUTE_RETURN_ON_ERROR(validate_native(a, b, c_to_use, output, alpha, beta, gemm_info));
             break;
         }
         case GEMMType::RESHAPED_V1:
         {
-            ARM_COMPUTE_RETURN_ON_ERROR(validate_reshaped_v1(a, b, c, output, alpha, beta, gemm_info));
+            ARM_COMPUTE_RETURN_ON_ERROR(validate_reshaped_v1(a, b, c_to_use, output, alpha, beta, gemm_info));
             break;
         }
         case GEMMType::RESHAPED_V2:
         {
-            ARM_COMPUTE_RETURN_ON_ERROR(validate_reshaped_v2(a, b, c, output, alpha, beta, gemm_info));
+            ARM_COMPUTE_RETURN_ON_ERROR(validate_reshaped_v2(a, b, c_to_use, output, alpha, beta, gemm_info));
             break;
         }
         case GEMMType::RESHAPED_ONLY_RHS:
         {
-            ARM_COMPUTE_RETURN_ON_ERROR(validate_reshaped_only_rhs(a, b, c, output, alpha, beta, gemm_info));
+            ARM_COMPUTE_RETURN_ON_ERROR(validate_reshaped_only_rhs(a, b, c_to_use, output, alpha, beta, gemm_info));
             break;
         }
         default:
@@ -592,7 +580,7 @@
     {
         case GEMMType::NATIVE:
         {
-            CLScheduler::get().enqueue(_mm_kernel, !_run_addition);
+            CLScheduler::get().enqueue(_mm_kernel, true);
             break;
         }
         case GEMMType::RESHAPED_V1:
@@ -606,7 +594,7 @@
                 CLScheduler::get().enqueue(_reshape_rhs_kernel, false);
             }
 
-            CLScheduler::get().enqueue(_mm_kernel, !_run_addition);
+            CLScheduler::get().enqueue(_mm_kernel, true);
             break;
         }
         case GEMMType::RESHAPED_V2:
@@ -620,7 +608,7 @@
                 CLScheduler::get().enqueue(_reshape_rhs_kernel, false);
             }
 
-            CLScheduler::get().enqueue(_mm_reshaped_kernel, !_run_addition);
+            CLScheduler::get().enqueue(_mm_reshaped_kernel, true);
             break;
         }
         case GEMMType::RESHAPED_ONLY_RHS:
@@ -631,7 +619,7 @@
                 CLScheduler::get().enqueue(_reshape_rhs_kernel, false);
             }
 
-            CLScheduler::get().enqueue(_mm_reshaped_only_rhs_kernel, !_run_addition);
+            CLScheduler::get().enqueue(_mm_reshaped_only_rhs_kernel, true);
             break;
         }
         default:
@@ -639,12 +627,6 @@
             ARM_COMPUTE_ERROR("GEMMType not supported");
         }
     }
-
-    // Run matrix addition kernel
-    if(_run_addition)
-    {
-        CLScheduler::get().enqueue(_ma_kernel);
-    }
 }
 
 void CLGEMM::prepare()
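
For reference, the CLGEMM changes above forward the bias tensor c straight into the matrix-multiply kernel whenever beta is non-zero, instead of running a separate matrix-addition kernel afterwards. A minimal standalone sketch of that decision, assuming a tolerance-based zero test in the spirit of helpers::float_ops::is_zero (the epsilon below is an illustrative assumption, not the library's constant):

    #include <cmath>
    #include <cstdio>

    // Illustrative stand-in for helpers::float_ops::is_zero (epsilon is assumed)
    static bool is_zero_f(float v, float eps = 1e-6f)
    {
        return std::fabs(v) < eps;
    }

    int main()
    {
        const float beta     = 1.0f;
        const bool  has_bias = true; // stands in for (c != nullptr)

        // Mirrors the new logic: pass the bias into the GEMM kernel only when
        // beta is non-zero and a bias tensor was supplied
        const bool fuse_add_c = !is_zero_f(beta) && has_bias;
        std::printf("fuse bias into GEMM kernel: %s\n", fuse_add_c ? "yes" : "no");
        return 0;
    }
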
diff --git a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
index 03d516f..be6be04 100644
--- a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
@@ -91,22 +91,27 @@
 }
 
 CLGEMMConvolutionLayer::CLGEMMConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(memory_manager), _reshape_weights(), _im2col_kernel(), _mm_gemm(memory_manager), _mm_gemmlowp(memory_manager), _col2im_kernel(), _activationlayer_function(), _add_bias_kernel(),
-      _original_weights(nullptr), _im2col_output(), _weights_reshaped(), _gemm_output(), _data_layout(DataLayout::NCHW), _append_bias(false), _skip_im2col(false), _skip_col2im(false), _is_quantized(false),
-      _is_activationlayer_enabled(false), _is_prepared(false), _run_addition(true)
+    : _memory_group(memory_manager), _reshape_weights(), _im2col_kernel(), _mm_gemm(memory_manager), _mm_gemmlowp(memory_manager), _col2im_kernel(), _activationlayer_function(),
+      _original_weights(nullptr), _im2col_output(), _weights_reshaped(), _gemm_output(), _skip_im2col(false), _skip_col2im(false), _is_quantized(false), _fuse_activation(true), _is_prepared(false)
 {
 }
 
 void CLGEMMConvolutionLayer::configure_mm(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const GEMMLowpOutputStageInfo &gemmlowp_output_stage,
-                                          int gemm_3d_depth)
+                                          int gemm_3d_depth, const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_mm(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), gemmlowp_output_stage, gemm_3d_depth, _skip_im2col,
-                                           _run_addition));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_mm(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), gemmlowp_output_stage, gemm_3d_depth, _skip_im2col, act_info));
 
-    const GEMMInfo &gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */,
-                                         gemm_3d_depth, _skip_im2col /* Reinterpret the input as 3D if im2col is skipped */,
-                                         false, gemmlowp_output_stage);
+    const GEMMInfo &gemm_info = GEMMInfo(false,                 // is_a_reshaped
+                                         false,                 // is_b_reshaped
+                                         true,                  // reshape_b_only_on_first_run
+                                         gemm_3d_depth,         // depth_output_gemm3d
+                                         _skip_im2col,          // reinterpret_input_as_3d
+                                         false,                 // retain_internal_weights
+                                         gemmlowp_output_stage, // gemmlowp_output_stage
+                                         false,                 // fp_mixed_precision
+                                         true,                  // broadcast_bias
+                                         act_info);             // activation_info
 
     if(_is_quantized)
     {
@@ -115,8 +120,8 @@
         const QuantizationInfo input_quantization_info   = input->info()->quantization_info();
         const QuantizationInfo weights_quantization_info = weights->info()->quantization_info();
 
-        input->info()->set_quantization_info(QuantizationInfo(input_quantization_info.scale, -input_quantization_info.offset));
-        weights->info()->set_quantization_info(QuantizationInfo(weights_quantization_info.scale, -weights_quantization_info.offset));
+        input->info()->set_quantization_info(QuantizationInfo(input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset));
+        weights->info()->set_quantization_info(QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset));
 
         _mm_gemmlowp.configure(input, weights, biases, output, gemm_info);
 
@@ -126,21 +131,26 @@
     }
     else
     {
-        // Bias does not need to be added in GEMM if im2col is being used or the Matrix Addition kernel needs to be run
-        const bool skip_bias_in_gemm = _run_addition || !_skip_im2col;
         // Configure matrix multiply function
-        _mm_gemm.configure(input, weights, (skip_bias_in_gemm) ? nullptr : biases, output, 1.0f, 1.0f, gemm_info);
+        _mm_gemm.configure(input, weights, biases, output, 1.0f, 1.0f, gemm_info);
     }
 }
 
 Status CLGEMMConvolutionLayer::validate_mm(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
-                                           const GEMMLowpOutputStageInfo &gemmlowp_output_stage, int gemm_3d_depth, bool skip_im2col, bool run_addition)
+                                           const GEMMLowpOutputStageInfo &gemmlowp_output_stage, int gemm_3d_depth, bool skip_im2col, const ActivationLayerInfo &act_info)
 {
     const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
 
-    const GEMMInfo &gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */,
-                                         gemm_3d_depth, skip_im2col /* Reinterpret the input as 3D if im2col is skipped */,
-                                         false, gemmlowp_output_stage);
+    const GEMMInfo &gemm_info = GEMMInfo(false,                 // is_a_reshaped
+                                         false,                 // is_b_reshaped
+                                         true,                  // reshape_b_only_on_first_run
+                                         gemm_3d_depth,         // depth_output_gemm3d
+                                         skip_im2col,           // reinterpret_input_as_3d
+                                         false,                 // retain_internal_weights
+                                         gemmlowp_output_stage, // gemmlowp_output_stage
+                                         false,                 // fp_mixed_precision
+                                         true,                  // broadcast_bias
+                                         act_info);             // activation_info
 
     if(is_quantized)
     {
@@ -151,18 +161,16 @@
 
         std::unique_ptr<ITensorInfo> input_qa   = input->clone();
         std::unique_ptr<ITensorInfo> weights_qa = weights->clone();
-        input_qa->set_quantization_info(QuantizationInfo(input_quantization_info.scale, -input_quantization_info.offset));
-        weights_qa->set_quantization_info(QuantizationInfo(weights_quantization_info.scale, -weights_quantization_info.offset));
+        input_qa->set_quantization_info(QuantizationInfo(input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset));
+        weights_qa->set_quantization_info(QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset));
 
         // Perform validation step on GEMMLowp
         return CLGEMMLowpMatrixMultiplyCore::validate(input_qa.get(), weights_qa.get(), biases, output, gemm_info);
     }
     else
     {
-        // Bias does not need to be added in GEMM if im2col is being used or the Matrix Addition kernel needs to be run
-        const bool skip_bias_in_gemm = run_addition || !skip_im2col;
         // Perform validation step on Matrix multiply function
-        return CLGEMM::validate(input, weights, (skip_bias_in_gemm) ? nullptr : biases, output, 1.0f, 1.0f, gemm_info);
+        return CLGEMM::validate(input, weights, biases, output, 1.0f, 1.0f, gemm_info);
     }
 }
 
@@ -190,16 +198,18 @@
     const unsigned int kernel_width  = weights->info()->dimension(idx_width);
     const unsigned int kernel_height = weights->info()->dimension(idx_height);
 
-    _is_prepared                = weights_info.retain_internal_weights();
-    _original_weights           = weights;
-    _is_quantized               = is_data_type_quantized_asymmetric(input->info()->data_type());
-    _data_layout                = data_layout;
-    _skip_im2col                = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1);
-    _skip_col2im                = data_layout == DataLayout::NHWC;
-    _append_bias                = (biases != nullptr) && (!_is_quantized);
-    _is_activationlayer_enabled = act_info.enabled();
-    // In case of F16, fused bias will be used in GEMM
-    _run_addition = (_skip_im2col) && (_append_bias) && (data_type != DataType::F16);
+    const UniformQuantizationInfo iq_info = input->info()->quantization_info().uniform();
+    const UniformQuantizationInfo wq_info = weights->info()->quantization_info().uniform();
+    const UniformQuantizationInfo oq_info = output->info()->quantization_info().uniform();
+
+    _is_prepared      = weights_info.retain_internal_weights();
+    _original_weights = weights;
+    _is_quantized     = is_data_type_quantized_asymmetric(input->info()->data_type());
+    _skip_im2col      = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1);
+    _skip_col2im      = data_layout == DataLayout::NHWC;
+
+    // Only for quantized types are there a few cases where we cannot fuse the activation function in GEMM
+    _fuse_activation = true;
 
     // Set the GPU target for im2col and col2im
     _im2col_kernel.set_target(CLScheduler::get().target());
@@ -208,8 +218,6 @@
     const ICLTensor *gemm_input_to_use  = input;
     ICLTensor       *gemm_output_to_use = output;
 
-    const ICLTensor *biases_to_use = (_append_bias && !_skip_im2col) ? biases : nullptr;
-
     // Get parameters from conv_info
     unsigned int stride_x = 0;
     unsigned int stride_y = 0;
@@ -227,9 +235,22 @@
 
     unsigned int mat_weights_cols = weights->info()->dimension(idx_kernels) / num_groups;
 
-    // _weights_reshaped will be auto configured in the kernel.
-    // Just append biases and do not transpose 1xW as it will be reshaped in CLGEMM
-    _reshape_weights.configure(weights, biases_to_use, &_weights_reshaped, num_groups);
+    const ICLTensor *biases_to_use = biases;
+    bool             append_bias   = false;
+
+    if(num_groups != 1 && biases != nullptr)
+    {
+        // num_groups != 1 is only supported for NCHW
+        // Since a utility function to reshape the biases is missing, we append the biases to the weights tensor
+        biases_to_use = nullptr;
+        append_bias   = true;
+
+        _reshape_weights.configure(weights, biases, &_weights_reshaped, num_groups);
+    }
+    else
+    {
+        _reshape_weights.configure(weights, nullptr, &_weights_reshaped, num_groups);
+    }
 
     // Create tensor to store im2col reshaped inputs
     if(!_skip_im2col)
@@ -237,7 +258,7 @@
         _memory_group.manage(&_im2col_output);
 
         // Configure and tune im2col. im2col output shape is auto-initialized
-        _im2col_kernel.configure(input, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, _append_bias, dilation, num_groups);
+        _im2col_kernel.configure(input, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, append_bias, dilation, num_groups);
 
         // Set quantization info
         _im2col_output.info()->set_quantization_info(input->info()->quantization_info());
@@ -246,11 +267,6 @@
         // Update GEMM input
         gemm_input_to_use = &_im2col_output;
     }
-    else if(_append_bias)
-    {
-        // Configure add bias kernel
-        _add_bias_kernel.configure(ArithmeticOperation::ADD, output, biases, output, ConvertPolicy::SATURATE);
-    }
 
     // Create GEMM output tensor
     if(!_skip_col2im)
@@ -281,9 +297,9 @@
     // Configure output stage for quantized case
     if(_is_quantized)
     {
-        const QuantizationInfo output_quant_info = (output->info()->total_size() == 0) ? input->info()->quantization_info() : output->info()->quantization_info();
+        const auto output_quant_info = (output->info()->total_size() == 0) ? iq_info : oq_info;
 
-        const float multiplier        = (input->info()->quantization_info().scale * weights->info()->quantization_info().scale) / output_quant_info.scale;
+        const float multiplier        = (iq_info.scale * wq_info.scale) / output_quant_info.scale;
         int         output_multiplier = 0;
         int         output_shift      = 0;
         quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
@@ -296,16 +312,20 @@
                                                                                    ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
                                                                                  };
 
-        if(_is_activationlayer_enabled && supported_acts.count(act_info.activation()) != 0)
+        if(act_info.enabled())
         {
-            const int a_const_int = output_quant_info.quantize(act_info.a(), RoundingPolicy::TO_NEAREST_UP);
-            const int b_const_int = output_quant_info.quantize(act_info.b(), RoundingPolicy::TO_NEAREST_UP);
+            if(supported_acts.count(act_info.activation()) != 0)
+            {
+                const int a_const_int = quantize_qasymm8(act_info.a(), output_quant_info);
+                const int b_const_int = quantize_qasymm8(act_info.b(), output_quant_info);
 
-            min_activation = act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU ? output_quant_info.offset : b_const_int;
-            max_activation = act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU ? 255 : a_const_int;
-
-            // If the activation layer is RELU, BOUNDED_RELU or LU_BOUNDED_RELU, we can use the GEMMLowp output stage to perform this operation
-            _is_activationlayer_enabled = false;
+                min_activation = act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU ? output_quant_info.offset : b_const_int;
+                max_activation = act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU ? 255 : a_const_int;
+            }
+            else
+            {
+                _fuse_activation = false;
+            }
         }
 
         // Set the GEMMLowp output stage info
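
To make the fused-activation bounds above concrete, a standalone sketch that maps a bounded-ReLU range onto QASYMM8 clamping values with the standard asymmetric quantization formula; the rounding behaviour is an assumption and may differ in detail from quantize_qasymm8:

    #include <algorithm>
    #include <cmath>
    #include <cstdio>

    // Standard QASYMM8 quantization: q = clamp(round(x / scale) + offset, 0, 255)
    static int quantize_u8(float x, float scale, int offset)
    {
        const int q = static_cast<int>(std::lround(x / scale)) + offset;
        return std::min(255, std::max(0, q));
    }

    int main()
    {
        // Illustrative output quantization parameters and an LU_BOUNDED_RELU range [b, a]
        const float scale  = 0.05f;
        const int   offset = 10;
        const float a = 6.0f, b = 0.0f;

        // Mirrors the min/max activation bounds handed to the GEMMLowp output stage
        const int min_activation = quantize_u8(b, scale, offset);
        const int max_activation = quantize_u8(a, scale, offset);
        std::printf("clamp accumulators to [%d, %d]\n", min_activation, max_activation);
        return 0;
    }
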
@@ -320,7 +340,7 @@
     // In case of NHWC, we need to run GEMM3D (gemm_3d_depth != 0) in order to avoid reshaping the output matrix
     const unsigned int gemm_3d_depth = (data_layout == DataLayout::NHWC) ? conv_h : 0;
 
-    configure_mm(gemm_input_to_use, &_weights_reshaped, biases, gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth);
+    configure_mm(gemm_input_to_use, &_weights_reshaped, biases_to_use, gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth, act_info);
 
     if(!_skip_im2col)
     {
@@ -342,7 +362,7 @@
     ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(idx_width) != conv_w) || (output->info()->dimension(idx_height) != conv_h),
                              "Output shape does not match the expected one");
 
-    if(_is_activationlayer_enabled)
+    if(!_fuse_activation)
     {
         _activationlayer_function.configure(output, nullptr, act_info);
     }
@@ -379,13 +399,14 @@
     const ITensorInfo *gemm_output_to_use = output;
     const ITensorInfo *weights_to_use     = weights;
 
-    const bool is_quantized               = is_data_type_quantized_asymmetric(data_type);
-    const bool append_bias                = (biases != nullptr) && (!is_quantized);
-    const bool skip_im2col                = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1);
-    const bool skip_col2im                = data_layout == DataLayout::NHWC;
-    bool       is_activationlayer_enabled = act_info.enabled();
-    // In case of F16, fused bias will be used in GEMM
-    const bool run_addition = (skip_im2col) && (append_bias) && (data_type != DataType::F16);
+    const bool is_quantized    = is_data_type_quantized_asymmetric(data_type);
+    const bool skip_im2col     = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1);
+    const bool skip_col2im     = data_layout == DataLayout::NHWC;
+    bool       fuse_activation = true;
+
+    const UniformQuantizationInfo iq_info = input->quantization_info().uniform();
+    const UniformQuantizationInfo wq_info = weights->quantization_info().uniform();
+    const UniformQuantizationInfo oq_info = output->quantization_info().uniform();
 
     ARM_COMPUTE_RETURN_ERROR_ON((weights->dimension(idx_channel) * num_groups) != input->dimension(idx_channel));
     ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
@@ -423,10 +444,26 @@
 
     unsigned int mat_weights_cols = weights->dimension(idx_kernels) / num_groups;
 
-    // Output tensor auto inizialitation if not yet initialized
-    ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayerReshapeWeights::validate(weights, is_quantized ? nullptr : biases, nullptr, num_groups));
-    weights_reshaped_info = TensorInfo(compute_weights_reshaped_shape(*weights, (append_bias && !skip_im2col), num_groups), 1, data_type);
-    weights_to_use        = &weights_reshaped_info;
+    const ITensorInfo *biases_to_use = biases;
+    bool               append_bias   = false;
+
+    if(num_groups != 1 && biases != nullptr)
+    {
+        // num_groups != 1 is only supported for NCHW
+        // Since a utility function to reshape the biases is missing, we append the biases to the weights tensor
+        biases_to_use = nullptr;
+        append_bias   = true;
+
+        ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayerReshapeWeights::validate(weights, biases, nullptr, num_groups));
+        weights_reshaped_info = TensorInfo(compute_weights_reshaped_shape(*weights, true, num_groups), 1, data_type);
+    }
+    else
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayerReshapeWeights::validate(weights, nullptr, nullptr, num_groups));
+        weights_reshaped_info = TensorInfo(compute_weights_reshaped_shape(*weights, false, num_groups), 1, data_type);
+    }
+
+    weights_to_use = &weights_reshaped_info;
 
     if(!skip_im2col)
     {
@@ -440,11 +477,6 @@
         ARM_COMPUTE_RETURN_ON_ERROR(CLIm2ColKernel::validate(input, &im2col_reshaped_info, kernel_dims, conv_info, append_bias, dilation, num_groups));
         gemm_input_to_use = &im2col_reshaped_info;
     }
-    else if(run_addition)
-    {
-        // Validate add bias kernel
-        ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, output, biases, output, ConvertPolicy::SATURATE));
-    }
 
     // Create GEMM output tensor
     if(!skip_col2im)
@@ -468,9 +500,9 @@
 
     if(is_quantized)
     {
-        const QuantizationInfo output_quant_info = (output->total_size() == 0) ? input->quantization_info() : output->quantization_info();
+        const auto output_quant_info = (output->total_size() == 0) ? iq_info : oq_info;
 
-        const float multiplier        = (input->quantization_info().scale * weights->quantization_info().scale) / output_quant_info.scale;
+        const float multiplier        = (iq_info.scale * wq_info.scale) / output_quant_info.scale;
         int         output_multiplier = 0;
         int         output_shift      = 0;
 
@@ -484,16 +516,20 @@
                                                                                    ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
                                                                                  };
 
-        if(is_activationlayer_enabled && supported_acts.count(act_info.activation()) != 0)
+        if(act_info.enabled())
         {
-            const int a_const_int = output_quant_info.quantize(act_info.a(), RoundingPolicy::TO_NEAREST_UP);
-            const int b_const_int = output_quant_info.quantize(act_info.b(), RoundingPolicy::TO_NEAREST_UP);
+            if(supported_acts.count(act_info.activation()) != 0)
+            {
+                const int a_const_int = quantize_qasymm8(act_info.a(), output_quant_info);
+                const int b_const_int = quantize_qasymm8(act_info.b(), output_quant_info);
 
-            min_activation = act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU ? output_quant_info.offset : b_const_int;
-            max_activation = act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU ? 255 : a_const_int;
-
-            // If the activation layer is RELU, BOUNDED_RELU or LU_BOUNDED_RELU, we can use the GEMMLowp output stage to perform this operation
-            is_activationlayer_enabled = false;
+                min_activation = act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU ? output_quant_info.offset : b_const_int;
+                max_activation = act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU ? 255 : a_const_int;
+            }
+            else
+            {
+                fuse_activation = false;
+            }
         }
 
         // Set the GEMMLowp output stage info
@@ -507,7 +543,7 @@
     // In case of NHWC, we need to run GEMM3D (gemm_3d_depth != 0) in order to avoid reshaping the output matrix
     const unsigned int gemm_3d_depth = (data_layout == DataLayout::NHWC) ? conv_h : 0;
 
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases, gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth, skip_im2col, run_addition));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases_to_use, gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth, skip_im2col, act_info));
 
     // Validate Col2Im
     if(!skip_col2im)
@@ -516,7 +552,7 @@
     }
 
     //Validate Activation Layer
-    if(is_activationlayer_enabled)
+    if(!fuse_activation)
     {
         ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output, nullptr, act_info));
     }
@@ -548,19 +584,14 @@
         _mm_gemm.run();
     }
 
-    if(_run_addition)
-    {
-        CLScheduler::get().enqueue(_add_bias_kernel);
-    }
-
     // Reshape output matrix
     if(!_skip_col2im)
     {
         CLScheduler::get().enqueue(_col2im_kernel, false);
     }
 
-    //Run Activation Layer if enabled
-    if(_is_activationlayer_enabled)
+    // Run Activation Layer if we cannot fuse it in GEMM
+    if(!_fuse_activation)
     {
         _activationlayer_function.run();
     }
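
As a usage-level illustration of the fused activation path in CLGEMMConvolutionLayer, a sketch of passing an ActivationLayerInfo at configure time so a bounded ReLU can be fused into the GEMM where supported. The shapes, values, and reliance on the defaulted trailing arguments are assumptions made for the example:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h"

    using namespace arm_compute;

    int main()
    {
        CLScheduler::get().default_init();

        // Illustrative FP32 NCHW shapes: 8x8x16 input, sixteen 3x3x16 kernels
        CLTensor src, weights, biases, dst;
        src.allocator()->init(TensorInfo(TensorShape(8U, 8U, 16U), 1, DataType::F32));
        weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 16U, 16U), 1, DataType::F32));
        biases.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(8U, 8U, 16U), 1, DataType::F32));

        // The activation passed here is fused into the GEMM whenever possible
        CLGEMMConvolutionLayer conv;
        conv.configure(&src, &weights, &biases, &dst,
                       PadStrideInfo(1, 1, 1, 1), WeightsInfo(), Size2D(1U, 1U),
                       ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f));

        src.allocator()->allocate();
        weights.allocator()->allocate();
        biases.allocator()->allocate();
        dst.allocator()->allocate();
        conv.run();
        return 0;
    }
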
diff --git a/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp
index bcb91e0..36a120e 100644
--- a/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp
@@ -277,11 +277,15 @@
 
     if(_is_quantized)
     {
-        float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / _gemmlowp_final.info()->quantization_info().scale;
+        const UniformQuantizationInfo iq_info = input->info()->quantization_info().uniform();
+        const UniformQuantizationInfo wq_info = weights->info()->quantization_info().uniform();
+        const UniformQuantizationInfo oq_info = _gemmlowp_final.info()->quantization_info().uniform();
+
+        float multiplier = iq_info.scale * wq_info.scale / oq_info.scale;
         int   output_multiplier(0);
         int   output_shift(0);
         quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
-        _gemmlowp_output_stage.configure(&_gemmlowp_final, nullptr, output_stage_output, output_multiplier, output_shift, _gemmlowp_final.info()->quantization_info().offset);
+        _gemmlowp_output_stage.configure(&_gemmlowp_final, nullptr, output_stage_output, output_multiplier, output_shift, oq_info.offset);
         _gemmlowp_final.allocator()->allocate();
     }
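
The quantized output stage above turns the real-valued multiplier iq_info.scale * wq_info.scale / oq_info.scale into a fixed-point multiplier and a right shift. A self-contained sketch of that decomposition, in the spirit of quantization::calculate_quantized_multiplier_less_than_one (the library's exact corner-case handling may differ):

    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    // Decompose a multiplier in (0, 1) into a Q0.31 fixed-point value and a right shift
    static void decompose_multiplier(double m, int32_t *quant_mult, int *right_shift)
    {
        int          exponent = 0;
        const double q        = std::frexp(m, &exponent); // m = q * 2^exponent, q in [0.5, 1)
        *right_shift          = -exponent;

        int64_t q_fixed = static_cast<int64_t>(std::llround(q * (1LL << 31)));
        if(q_fixed == (1LL << 31)) // rounding pushed q up to 1.0
        {
            q_fixed /= 2;
            --(*right_shift);
        }
        *quant_mult = static_cast<int32_t>(q_fixed);
    }

    int main()
    {
        // e.g. multiplier = input_scale * weights_scale / output_scale
        const double multiplier = (0.5 * 0.004) / 0.05;

        int32_t qm    = 0;
        int     shift = 0;
        decompose_multiplier(multiplier, &qm, &shift);
        std::printf("multiplier %.6f -> fixed-point %d, right shift %d\n",
                    multiplier, static_cast<int>(qm), shift);
        return 0;
    }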
 
diff --git a/src/runtime/CL/functions/CLGEMMInterleave4x4.cpp b/src/runtime/CL/functions/CLGEMMInterleave4x4.cpp
deleted file mode 100644
index 45547e4..0000000
--- a/src/runtime/CL/functions/CLGEMMInterleave4x4.cpp
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLGEMMInterleave4x4.h"
-
-#include "arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h"
-#include "support/ToolchainSupport.h"
-
-using namespace arm_compute;
-
-void CLGEMMInterleave4x4::configure(const ICLTensor *input, ICLTensor *output)
-{
-    auto k = arm_compute::support::cpp14::make_unique<CLGEMMInterleave4x4Kernel>();
-    k->configure(input, output);
-    _kernel = std::move(k);
-}
diff --git a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
index 049db1d..0286cb3 100644
--- a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
@@ -24,6 +24,7 @@
 #include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h"
 
 #include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfiguration.h"
 #include "arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
@@ -48,7 +49,8 @@
 
 CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)),
-      _mm_kernel(),
+      _mm_midgard_kernel(),
+      _mm_native_kernel(),
       _mm_reshaped_only_rhs_kernel(),
       _mtx_b_reshape_kernel(),
       _mtx_a_reduction_kernel(),
@@ -63,6 +65,7 @@
       _a_offset(0),
       _b_offset(0),
       _is_gemm_reshaped(true),
+      _is_midgard(false),
       _reshape_b_only_on_first_run(false),
       _is_prepared(false),
       _fuse_output_stage(false)
@@ -77,14 +80,16 @@
     _is_prepared                 = false;
     _original_b                  = b;
     _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
-    _a_offset                    = a->info()->quantization_info().offset;
-    _b_offset                    = b->info()->quantization_info().offset;
+    _a_offset                    = a->info()->quantization_info().uniform().offset;
+    _b_offset                    = b->info()->quantization_info().uniform().offset;
 
     // Get the GPU target
     const GPUTarget gpu_target = CLScheduler::get().target();
 
     // Set the target for the kernels
-    _mm_kernel.set_target(gpu_target);
+    _mm_midgard_kernel.set_target(gpu_target);
+    _mm_native_kernel.set_target(gpu_target);
+    _mm_reshaped_only_rhs_kernel.set_target(gpu_target);
 
     const ICLTensor *matrix_a = a;
     const ICLTensor *matrix_b = b;
@@ -103,6 +108,7 @@
 
     // Check if we need to reshape the matrix A and matrix B
     _is_gemm_reshaped = is_gemm_reshaped(_reshape_b_only_on_first_run, gpu_target);
+    _is_midgard       = gpu_target == GPUTarget::MIDGARD;
 
     if(_is_gemm_reshaped)
     {
@@ -159,8 +165,19 @@
         }
         else
         {
-            // Configure matrix multiply kernel
-            _mm_kernel.configure(matrix_a, matrix_b, &_mm_result_s32, false, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
+            if(_is_midgard)
+            {
+                // Configure matrix multiply kernel
+                _mm_midgard_kernel.configure(matrix_a, matrix_b, &_mm_result_s32, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
+            }
+            else
+            {
+                // Pick up the GEMM configuration
+                std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);
+
+                // Configure matrix multiply kernel
+                _mm_native_kernel.configure(matrix_a, matrix_b, &_mm_result_s32, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
+            }
         }
 
         // Configure offset contribution kernel
@@ -178,8 +195,19 @@
         }
         else
         {
-            // Configure matrix multiply kernel
-            _mm_kernel.configure(matrix_a, matrix_b, output, false, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
+            if(_is_midgard)
+            {
+                // Configure matrix multiply kernel
+                _mm_midgard_kernel.configure(matrix_a, matrix_b, output, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
+            }
+            else
+            {
+                // Pick up the GEMM configuration
+                std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);
+
+                // Configure matrix multiply kernel
+                _mm_native_kernel.configure(matrix_a, matrix_b, output, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
+            }
         }
 
         // Configure offset contribution kernel
@@ -213,8 +241,8 @@
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
 
-    int32_t a_offset = a->quantization_info().offset;
-    int32_t b_offset = b->quantization_info().offset;
+    int32_t a_offset = a->quantization_info().uniform().offset;
+    int32_t b_offset = b->quantization_info().uniform().offset;
 
     const ITensorInfo *matrix_a_info = a;
     const ITensorInfo *matrix_b_info = b;
@@ -232,6 +260,7 @@
     const unsigned int k                       = a->dimension(0);
     const unsigned int batch_size              = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
     const int          depth_output_gemm3d     = gemm_info.depth_output_gemm3d();
+    const bool         is_midgard              = gpu_target == GPUTarget::MIDGARD;
 
     bool reshape_matrix_b = is_gemm_reshaped(gemm_info.reshape_b_only_on_first_run(), CLScheduler::get().target());
 
@@ -287,9 +316,21 @@
             // Output tensor auto inizialitation if not yet initialized
             auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, false, reshape_info)).set_data_type(DataType::S32));
 
-            // Validate matrix multiply
-            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, false, reshape_info));
+            if(is_midgard)
+            {
+                // Validate matrix multiply
+                ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, reshape_info));
+            }
+            else
+            {
+                // Pick up the GEMM configuration
+                std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);
+
+                // Validate matrix multiply
+                ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyNativeKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, lhs_info, rhs_info, reshape_info));
+            }
         }
+
         // Validate offset contribution kernel
         ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOffsetContributionOutputStageKernel::validate(&mm_result_s32_info,
                                                                                             a_offset == 0 ? nullptr : &info_vector_sum_col,
@@ -308,9 +349,21 @@
         }
         else
         {
-            // Validate matrix multiply
-            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output, false, reshape_info));
+            if(is_midgard)
+            {
+                // Validate matrix multiply
+                ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output, reshape_info));
+            }
+            else
+            {
+                // Pick up the GEMM configuration
+                std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);
+
+                // Validate matrix multiply
+                ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyNativeKernel::validate(matrix_a_info, matrix_b_info, output, lhs_info, rhs_info, reshape_info));
+            }
         }
+
         if(output->total_size() != 0)
         {
             // Validate offset contribution kernel
@@ -353,7 +406,14 @@
     }
     else
     {
-        CLScheduler::get().enqueue(_mm_kernel, false);
+        if(_is_midgard)
+        {
+            CLScheduler::get().enqueue(_mm_midgard_kernel, false);
+        }
+        else
+        {
+            CLScheduler::get().enqueue(_mm_native_kernel, false);
+        }
     }
 
     // Run matrix A reduction kernel only if _b_offset is not equal to 0
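
The offsets above are now read through QuantizationInfo::uniform(), which returns the per-tensor UniformQuantizationInfo. A minimal sketch of that accessor with illustrative values (the include is assumed to provide QuantizationInfo in this release):

    #include "arm_compute/core/Types.h"
    #include <cstdio>

    int main()
    {
        using namespace arm_compute;

        // Illustrative scale/offset pair for a QASYMM8 tensor
        QuantizationInfo              qinfo(0.00390625f, 128);
        const UniformQuantizationInfo uq = qinfo.uniform();

        // Scale and offset are read through uniform() rather than directly
        std::printf("scale = %f, offset = %d\n", uq.scale, static_cast<int>(uq.offset));
        return 0;
    }
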
diff --git a/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp b/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp
index f1282cb..020fbbe 100644
--- a/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp
+++ b/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,6 +24,7 @@
 #include "arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h"
 
 #include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h"
 #include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h"
 #include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloatKernel.h"
 #include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h"
@@ -72,4 +73,20 @@
 {
     return CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloatKernel::validate(input, bias, output, min, max);
 }
+
+void CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output,
+                                                                    int result_fixedpoint_multiplier, int result_shift,
+                                                                    int min, int max)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel>();
+    k->configure(input, bias, output, result_fixedpoint_multiplier, result_shift, min, max);
+    _kernel = std::move(k);
+}
+
+Status CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output,
+                                                                     int min, int max)
+{
+    return CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::validate(input, bias, output, min, max);
+}
+
 } // namespace arm_compute
\ No newline at end of file
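
A possible usage sketch of the CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint function added above. The shapes, the fixed-point multiplier and shift, and the QSYMM16 destination type are assumptions made for illustration; the S32 source would normally be the output of a GEMMLowp matrix multiplication:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h"

    using namespace arm_compute;

    int main()
    {
        CLScheduler::get().default_init();

        CLTensor src, dst;
        src.allocator()->init(TensorInfo(TensorShape(64U, 16U), 1, DataType::S32));
        dst.allocator()->init(TensorInfo(TensorShape(64U, 16U), 1, DataType::QSYMM16));

        // Requantize the S32 accumulators down to 16-bit values
        CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint output_stage;
        output_stage.configure(&src, nullptr /* no bias */, &dst,
                               1073741824 /* result_fixedpoint_multiplier */, 5 /* result_shift */,
                               0, 0 /* min/max: no extra clamping assumed */);

        src.allocator()->allocate();
        dst.allocator()->allocate();
        output_stage.run();
        return 0;
    }
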
diff --git a/src/runtime/CL/functions/CLL2NormalizeLayer.cpp b/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
index 136cb5e..e76e4f6 100644
--- a/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
+++ b/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
@@ -34,25 +34,31 @@
 
 namespace arm_compute
 {
+namespace
+{
+constexpr int max_input_tensor_dim = 3;
+} // namespace
+
 CLL2NormalizeLayer::CLL2NormalizeLayer(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)), _reduce_func(), _normalize_kernel(), _sumsq()
 {
 }
 
-void CLL2NormalizeLayer::configure(ICLTensor *input, ICLTensor *output, unsigned int axis, float epsilon)
+void CLL2NormalizeLayer::configure(ICLTensor *input, ICLTensor *output, int axis, float epsilon)
 {
     // Manage intermediate buffers
     _memory_group.manage(&_sumsq);
 
     // Configure kernels
-    _reduce_func.configure(input, &_sumsq, axis, ReductionOperation::SUM_SQUARE);
+    const uint32_t actual_axis = wrap_around(axis, max_input_tensor_dim);
+    _reduce_func.configure(input, &_sumsq, actual_axis, ReductionOperation::SUM_SQUARE);
     _normalize_kernel.configure(input, &_sumsq, output, axis, epsilon);
 
     // Allocate intermediate tensor
     _sumsq.allocator()->allocate();
 }
 
-Status CLL2NormalizeLayer::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, float epsilon)
+Status CLL2NormalizeLayer::validate(const ITensorInfo *input, const ITensorInfo *output, int axis, float epsilon)
 {
     TensorShape shape(input->tensor_shape());
 
@@ -61,10 +67,11 @@
     sum_sq.set_data_type(input->data_type());
     sum_sq.set_tensor_shape(shape);
 
-    ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperation::validate(input, &sum_sq, axis, ReductionOperation::SUM_SQUARE));
+    const uint32_t actual_axis = wrap_around(axis, max_input_tensor_dim);
+    ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperation::validate(input, &sum_sq, actual_axis, ReductionOperation::SUM_SQUARE));
 
     // Reduce shape on axis
-    shape.set(axis, 1);
+    shape.set(actual_axis, 1);
     sum_sq.set_tensor_shape(shape);
 
     ARM_COMPUTE_RETURN_ON_ERROR(CLL2NormalizeLayerKernel::validate(input, &sum_sq, output, axis, epsilon));
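
The signed axis argument above is remapped with wrap_around before the reduction, so negative indices count from the last supported dimension. A minimal sketch of that remapping (the re-implementation below only illustrates the helper's behaviour):

    #include <cstdio>

    // Map a possibly negative axis into [0, max_dims)
    static unsigned int wrap_axis(int axis, int max_dims)
    {
        return static_cast<unsigned int>(((axis % max_dims) + max_dims) % max_dims);
    }

    int main()
    {
        const int max_input_tensor_dim = 3;

        // axis = -1 now refers to the last of the three supported dimensions
        std::printf("axis -1 -> %u, axis 1 -> %u\n",
                    wrap_axis(-1, max_input_tensor_dim),
                    wrap_axis(1, max_input_tensor_dim));
        return 0;
    }
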
diff --git a/src/runtime/CL/functions/CLLSTMLayer.cpp b/src/runtime/CL/functions/CLLSTMLayer.cpp
index 4606a66..793d5ca 100644
--- a/src/runtime/CL/functions/CLLSTMLayer.cpp
+++ b/src/runtime/CL/functions/CLLSTMLayer.cpp
@@ -38,16 +38,18 @@
 using namespace arm_compute::misc::shape_calculator;
 
 CLLSTMLayer::CLLSTMLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _fully_connected_input_gate(), _gemm_input_gate(), _transpose_input_gate(), _accum_input_gate1(), _accum_input_gate2(), _subtract_input_gate(),
-      _pixelwise_mul_input_gate(), _activation_input_gate(), _fully_connected_forget_gate(), _gemm_forget_gate(), _transpose_forget_gate(), _accum_forget_gate1(), _accum_forget_gate2(),
-      _pixelwise_mul_forget_gate(), _activation_forget_gate(), _fully_connected_cell_state(), _gemm_cell_state1(), _gemm_cell_state2(), _transpose_cell_state(), _accum_cell_state1(), _accum_cell_state2(),
-      _pixelwise_mul_cell_state1(), _activation_cell_state(), _cell_clip(), _pixelwise_mul_cell_state2(), _fully_connected_output(), _gemm_output(), _pixelwise_mul_output_state1(), _transpose_output(),
-      _accum_output1(), _accum_output2(), _activation_output(), _activation_output_state(), _pixelwise_mul_output_state2(), _fully_connected_output_state(), _gemm_output_state(), _accum_output_state(),
-      _projection_clip(), _copy_cell_state(), _copy_output(), _concat_scratch_buffer(), _concat_inputs_forget_gate(), _concat_weights_forget_gate(), _concat_weights_input_gate(), _concat_weights_output(),
-      _ones_memset_kernel(), _input_gate_out1(), _input_gate_out2(), _input_gate_out3(), _input_gate_out4(), _forget_gate_out1(), _forget_gate_out2(), _forget_gate_out3(), _forget_gate_out4(),
+    : _memory_group(std::move(memory_manager)), _fully_connected_input_gate(), _accum_input_gate1(), _subtract_input_gate(), _pixelwise_mul_input_gate(), _activation_input_gate(),
+      _fully_connected_forget_gate(), _accum_forget_gate1(), _pixelwise_mul_forget_gate(), _activation_forget_gate(), _fully_connected_cell_state(), _gemm_cell_state1(), _transpose_cell_state(),
+      _accum_cell_state1(), _accum_cell_state2(), _pixelwise_mul_cell_state1(), _activation_cell_state(), _cell_clip(), _pixelwise_mul_cell_state2(), _fully_connected_output(),
+      _pixelwise_mul_output_state1(), _accum_output1(), _activation_output(), _activation_output_state(), _pixelwise_mul_output_state2(), _fully_connected_output_state(), _projection_clip(),
+      _copy_cell_state(), _copy_output(), _concat_scratch_buffer(), _concat_inputs_forget_gate(), _concat_weights_forget_gate(), _concat_weights_input_gate(), _concat_weights_output(),
+      _ones_memset_kernel(), _mean_std_norm_input_gate(), _pixelwise_mul_input_gate_coeff(), _accum_input_gate_bias(), _mean_std_norm_forget_gate(), _pixelwise_mul_forget_gate_coeff(),
+      _accum_forget_gate_bias(), _mean_std_norm_cell_gate(), _pixelwise_mul_cell_gate_coeff(), _accum_cell_gate_bias(), _mean_std_norm_output_gate(), _pixelwise_mul_output_gate_coeff(),
+      _accum_output_gate_bias(), _input_gate_out1(), _input_gate_out2(), _input_gate_out3(), _input_gate_out4(), _forget_gate_out1(), _forget_gate_out2(), _forget_gate_out3(), _forget_gate_out4(),
       _forget_gate_out5(), _forget_gate_out6(), _cell_state_out1(), _cell_state_out2(), _cell_state_out3(), _cell_state_out4(), _cell_state_out5(), _output1(), _output2(), _output3(), _output4(),
-      _cell_state_activation(), _output_state1(), _ones(), _run_peephole_opt(false), _run_cifg_opt(false), _perform_cell_clipping(false), _has_projection_weights(false), _perform_projection_clipping(false),
-      _is_prepared(false)
+      _cell_state_activation(), _output_state1(), _ones(), _input_layer_norm_out1(), _input_layer_norm_out2(), _forget_layer_norm_out1(), _forget_layer_norm_out2(), _cell_layer_norm_out1(),
+      _cell_layer_norm_out2(), _output_layer_norm_out1(), _output_layer_norm_out2(), _run_peephole_opt(false), _run_cifg_opt(false), _perform_cell_clipping(false), _has_projection_weights(false),
+      _perform_projection_clipping(false), _is_prepared(false), _is_layer_norm_lstm(false)
 {
 }
 
@@ -66,6 +68,8 @@
                                  output_state_in, cell_state_in,
                                  scratch_buffer, output_state_out, cell_state_out, output);
 
+    _is_layer_norm_lstm = lstm_params.use_layer_norm();
+
     // Set lstm parameters
     LSTMParams<ITensorInfo> lstm_params_info;
     if(lstm_params.has_peephole_opt())
@@ -121,7 +125,7 @@
     _concat_weights_forget_gate.configure(input_to_forget_weights, recurrent_to_forget_weights, &_forget_gate_out6);
 
     _memory_group.manage(&_forget_gate_out5);
-    _fully_connected_forget_gate.configure(&_forget_gate_out2, &_forget_gate_out6, forget_gate_bias, &_forget_gate_out5);
+    _fully_connected_forget_gate.configure(&_forget_gate_out2, &_forget_gate_out6, (_is_layer_norm_lstm) ? nullptr : forget_gate_bias, &_forget_gate_out5);
     _memory_group.manage(&_forget_gate_out1);
     _memory_group.manage(&_forget_gate_out3);
     _forget_gate_out6.allocator()->allocate();
@@ -134,7 +138,7 @@
         _run_peephole_opt = true;
         _memory_group.manage(&_forget_gate_out4);
         _pixelwise_mul_forget_gate.configure(cell_state_in, lstm_params.cell_to_forget_weights(), &_forget_gate_out4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
-        _accum_forget_gate2.configure(&_forget_gate_out5, &_forget_gate_out4, &_forget_gate_out3, ConvertPolicy::SATURATE);
+        _accum_forget_gate1.configure(&_forget_gate_out5, &_forget_gate_out4, &_forget_gate_out3, ConvertPolicy::SATURATE);
         _forget_gate_out4.allocator()->allocate();
         _forget_gate_out5.allocator()->allocate();
         forget_gate_out = &_forget_gate_out3;
@@ -143,6 +147,20 @@
     {
         _forget_gate_out3.allocator()->allocate();
     }
+    if(_is_layer_norm_lstm)
+    {
+        _forget_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+        _forget_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+        _memory_group.manage(&_forget_layer_norm_out1);
+        _memory_group.manage(&_forget_layer_norm_out2);
+        _mean_std_norm_forget_gate.configure(forget_gate_out);
+        _pixelwise_mul_forget_gate_coeff.configure(forget_gate_out, lstm_params.forget_layer_norm_weights(), &_forget_layer_norm_out1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
+        // forget_gate_out is going to be reassigned, so allocate the tensor that it was assigned to before
+        forget_gate_out->allocator()->allocate();
+        _accum_forget_gate_bias.configure(ArithmeticOperation::ADD, &_forget_layer_norm_out1, forget_gate_bias, &_forget_layer_norm_out2, ConvertPolicy::SATURATE);
+        _forget_layer_norm_out1.allocator()->allocate();
+        forget_gate_out = &_forget_layer_norm_out2;
+    }
     _activation_forget_gate.configure(forget_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
 
     // Configure block that calculates the input gate
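
A scalar sketch of the per-gate layer-normalisation path that each gate above follows when lstm_params.use_layer_norm() is set: normalise the pre-activation, scale by the gate's layer-norm weights, add the gate bias that was withheld from the fully connected layer, then apply the logistic activation. The epsilon and the example values are assumptions:

    #include <cmath>
    #include <cstdio>
    #include <vector>

    int main()
    {
        std::vector<float> gate   = { 0.5f, -1.2f, 2.0f, 0.1f }; // W_x*x + W_h*h (illustrative)
        std::vector<float> norm_w = { 1.0f, 0.9f, 1.1f, 1.0f };  // per-gate layer-norm weights
        std::vector<float> bias   = { 0.0f, 0.1f, -0.1f, 0.0f }; // gate bias

        // Mean/standard-deviation normalisation over the gate vector
        float mean = 0.f;
        float var  = 0.f;
        for(float v : gate)
        {
            mean += v;
        }
        mean /= gate.size();
        for(float v : gate)
        {
            var += (v - mean) * (v - mean);
        }
        var /= gate.size();
        const float inv_std = 1.f / std::sqrt(var + 1e-8f); // epsilon is an assumption

        for(size_t i = 0; i < gate.size(); ++i)
        {
            const float normed = (gate[i] - mean) * inv_std;
            const float pre    = normed * norm_w[i] + bias[i];
            gate[i]            = 1.f / (1.f + std::exp(-pre)); // logistic activation
            std::printf("gate[%zu] = %f\n", i, gate[i]);
        }
        return 0;
    }
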
@@ -177,7 +195,7 @@
         _memory_group.manage(&_input_gate_out1);
 
         _memory_group.manage(&_input_gate_out3);
-        _fully_connected_input_gate.configure(&_forget_gate_out2, &_input_gate_out2, lstm_params.input_gate_bias(), &_input_gate_out3);
+        _fully_connected_input_gate.configure(&_forget_gate_out2, &_input_gate_out2, (_is_layer_norm_lstm) ? nullptr : lstm_params.input_gate_bias(), &_input_gate_out3);
         _input_gate_out2.allocator()->allocate();
 
         input_gate_out = &_input_gate_out3;
@@ -185,7 +203,7 @@
         {
             _memory_group.manage(&_input_gate_out4);
             _pixelwise_mul_input_gate.configure(cell_state_in, lstm_params.cell_to_input_weights(), &_input_gate_out4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
-            _accum_input_gate2.configure(&_input_gate_out3, &_input_gate_out4, &_input_gate_out1, ConvertPolicy::SATURATE);
+            _accum_input_gate1.configure(&_input_gate_out3, &_input_gate_out4, &_input_gate_out1, ConvertPolicy::SATURATE);
             _input_gate_out3.allocator()->allocate();
             _input_gate_out4.allocator()->allocate();
             input_gate_out = &_input_gate_out1;
@@ -194,6 +212,21 @@
         {
             _input_gate_out1.allocator()->allocate();
         }
+
+        if(_is_layer_norm_lstm)
+        {
+            _input_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+            _input_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+            _memory_group.manage(&_input_layer_norm_out1);
+            _memory_group.manage(&_input_layer_norm_out2);
+            _mean_std_norm_input_gate.configure(input_gate_out);
+            _pixelwise_mul_input_gate_coeff.configure(input_gate_out, lstm_params.input_layer_norm_weights(), &_input_layer_norm_out1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
+            // input_gate_out is going to be reassigned, so allocate the tensor that it was assigned to before
+            input_gate_out->allocator()->allocate();
+            _accum_input_gate_bias.configure(ArithmeticOperation::ADD, &_input_layer_norm_out1, lstm_params.input_gate_bias(), &_input_layer_norm_out2, ConvertPolicy::SATURATE);
+            _input_layer_norm_out1.allocator()->allocate();
+            input_gate_out = &_input_layer_norm_out2;
+        }
         _activation_input_gate.configure(input_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
     }
 
@@ -207,7 +240,7 @@
     _cell_state_out5.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
 
     _memory_group.manage(&_cell_state_out1);
-    _fully_connected_cell_state.configure(input, input_to_cell_weights, cell_bias, &_cell_state_out1);
+    _fully_connected_cell_state.configure(input, input_to_cell_weights, (_is_layer_norm_lstm) ? nullptr : cell_bias, &_cell_state_out1);
     _memory_group.manage(&_cell_state_out2);
     _transpose_cell_state.configure(recurrent_to_cell_weights, &_cell_state_out2);
     _memory_group.manage(&_cell_state_out3);
@@ -215,10 +248,25 @@
     _cell_state_out2.allocator()->allocate();
     _memory_group.manage(&_cell_state_out4);
     _accum_cell_state1.configure(ArithmeticOperation::ADD, &_cell_state_out1, &_cell_state_out3, &_cell_state_out4, ConvertPolicy::SATURATE);
-    _activation_cell_state.configure(&_cell_state_out4, nullptr, activation_info);
+    CLTensor *cell_state_out_ptr = &_cell_state_out4;
+    if(_is_layer_norm_lstm)
+    {
+        _cell_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+        _cell_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+        _memory_group.manage(&_cell_layer_norm_out1);
+        _memory_group.manage(&_cell_layer_norm_out2);
+        _mean_std_norm_cell_gate.configure(cell_state_out_ptr);
+        _pixelwise_mul_cell_gate_coeff.configure(cell_state_out_ptr, lstm_params.cell_layer_norm_weights(), &_cell_layer_norm_out1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
+        // cell_state_out_ptr is going to be reassigned, so allocate the tensor that it was assigned to before
+        cell_state_out_ptr->allocator()->allocate();
+        _accum_cell_gate_bias.configure(ArithmeticOperation::ADD, &_cell_layer_norm_out1, cell_bias, &_cell_layer_norm_out2, ConvertPolicy::SATURATE);
+        _cell_layer_norm_out1.allocator()->allocate();
+        cell_state_out_ptr = &_cell_layer_norm_out2;
+    }
+    _activation_cell_state.configure(cell_state_out_ptr, nullptr, activation_info);
     _memory_group.manage(&_cell_state_out5);
-    _pixelwise_mul_cell_state1.configure(&_cell_state_out4, input_gate_out, &_cell_state_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
-    _cell_state_out4.allocator()->allocate();
+    _pixelwise_mul_cell_state1.configure(cell_state_out_ptr, input_gate_out, &_cell_state_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
+    cell_state_out_ptr->allocator()->allocate();
     _pixelwise_mul_cell_state2.configure(forget_gate_out, cell_state_in, &_cell_state_out3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
     _accum_cell_state2.configure(ArithmeticOperation::ADD, &_cell_state_out5, &_cell_state_out3, &_cell_state_out1, ConvertPolicy::SATURATE);
     _cell_state_out3.allocator()->allocate();
@@ -247,7 +295,7 @@
     _memory_group.manage(&_output1);
     _memory_group.manage(&_output4);
 
-    _fully_connected_output.configure(&_forget_gate_out2, &_output2, output_gate_bias, &_output4);
+    _fully_connected_output.configure(&_forget_gate_out2, &_output2, (_is_layer_norm_lstm) ? nullptr : output_gate_bias, &_output4);
 
     _output2.allocator()->allocate();
     _forget_gate_out2.allocator()->allocate();
@@ -259,7 +307,7 @@
 
         _memory_group.manage(&_output3);
         _pixelwise_mul_output_state1.configure(&_cell_state_out1, lstm_params.cell_to_output_weights(), &_output3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
-        _accum_output2.configure(&_output4, &_output3, &_output1, ConvertPolicy::SATURATE);
+        _accum_output1.configure(&_output4, &_output3, &_output1, ConvertPolicy::SATURATE);
         _output4.allocator()->allocate();
         output_gate_out = &_output1;
 
@@ -270,6 +318,20 @@
     {
         _output1.allocator()->allocate();
     }
+    if(_is_layer_norm_lstm)
+    {
+        _output_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+        _output_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+        _memory_group.manage(&_output_layer_norm_out1);
+        _memory_group.manage(&_output_layer_norm_out2);
+        _mean_std_norm_output_gate.configure(output_gate_out);
+        _pixelwise_mul_output_gate_coeff.configure(output_gate_out, lstm_params.output_layer_norm_weights(), &_output_layer_norm_out1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
+        // output_gate_out is going to be reassigned, so allocate the tensor that it was assigned to before
+        output_gate_out->allocator()->allocate();
+        _accum_output_gate_bias.configure(ArithmeticOperation::ADD, &_output_layer_norm_out1, output_gate_bias, &_output_layer_norm_out2, ConvertPolicy::SATURATE);
+        _output_layer_norm_out1.allocator()->allocate();
+        output_gate_out = &_output_layer_norm_out2;
+    }
     _activation_output.configure(output_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
 
     // Configure block that calculates the output state
@@ -316,7 +378,7 @@
     scratch_inputs.emplace_back(&_cell_state_out1);
     scratch_inputs.emplace_back(forget_gate_out);
     scratch_inputs.emplace_back(output_gate_out);
-    _concat_scratch_buffer.configure(scratch_inputs, scratch_buffer);
+    _concat_scratch_buffer.configure(scratch_inputs, scratch_buffer, Window::DimX);
     input_gate_out->allocator()->allocate();
     _cell_state_out1.allocator()->allocate();
     forget_gate_out->allocator()->allocate();
@@ -370,6 +432,31 @@
     const unsigned int num_batches = input->dimension(1);
     const unsigned int num_cells   = input_to_output_weights->dimension(1);
 
+    if(lstm_params.use_layer_norm())
+    {
+        // If CIFG is used, input layer normalization weights tensor is omitted
+        if(lstm_params.has_cifg_opt())
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_layer_norm_weights() != nullptr);
+        }
+        else
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.input_layer_norm_weights());
+            ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_layer_norm_weights()->num_dimensions() > 1);
+            ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_layer_norm_weights()->dimension(0) != num_batches);
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, lstm_params.input_layer_norm_weights());
+        }
+
+        ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.forget_layer_norm_weights(), lstm_params.cell_layer_norm_weights(), lstm_params.output_layer_norm_weights());
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, lstm_params.forget_layer_norm_weights(), lstm_params.cell_layer_norm_weights(), lstm_params.output_layer_norm_weights());
+        ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.forget_layer_norm_weights()->num_dimensions() > 1);
+        ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_layer_norm_weights()->num_dimensions() > 1);
+        ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.output_layer_norm_weights()->num_dimensions() > 1);
+        ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.forget_layer_norm_weights()->dimension(0) != num_batches);
+        ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_layer_norm_weights()->dimension(0) != num_batches);
+        ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.output_layer_norm_weights()->dimension(0) != num_batches);
+    }
+
     // Check peephole optimization
     if(lstm_params.has_peephole_opt())
     {
@@ -389,7 +476,7 @@
     TensorInfo cell_state_tmp  = TensorInfo(TensorShape(num_cells, num_batches), 1, input->data_type());
 
     // Validate forget gate
-    ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, input_to_forget_weights, forget_gate_bias, &forget_gate));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, input_to_forget_weights, (lstm_params.use_layer_norm()) ? nullptr : forget_gate_bias, &forget_gate));
 
     std::vector<const ITensorInfo *> inputs_vector;
     inputs_vector.emplace_back(input);
@@ -404,6 +491,13 @@
         ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &forget_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
         ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
     }
+    if(lstm_params.use_layer_norm())
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(CLMeanStdDevNormalizationLayer::validate(&forget_gate));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(&forget_gate, lstm_params.forget_layer_norm_weights(), &forget_gate, 1, ConvertPolicy::SATURATE,
+                                                                              RoundingPolicy::TO_NEAREST_EVEN));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_gate, forget_gate_bias, &forget_gate, ConvertPolicy::SATURATE));
+    }
     ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(&forget_gate, &forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
 
     // Validate input gate
@@ -423,7 +517,7 @@
         TensorInfo  lstm_gate_concat          = TensorInfo(lstm_weights_concat_shape, 1, input->data_type());
         ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenate2TensorsKernel::validate(lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights(), &lstm_gate_concat));
 
-        ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, lstm_params.input_to_input_weights(), lstm_params.input_gate_bias(), &input_gate));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, lstm_params.input_to_input_weights(), (lstm_params.use_layer_norm()) ? nullptr : lstm_params.input_gate_bias(), &input_gate));
 
         if(lstm_params.has_peephole_opt())
         {
@@ -432,6 +526,13 @@
             ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(cell_state_in, lstm_params.cell_to_input_weights(), &input_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
             ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_gate, &input_gate, &input_gate, ConvertPolicy::SATURATE));
         }
+
+        if(lstm_params.use_layer_norm())
+        {
+            ARM_COMPUTE_RETURN_ON_ERROR(CLMeanStdDevNormalizationLayer::validate(&input_gate));
+            ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(&input_gate, lstm_params.input_layer_norm_weights(), &input_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
+            ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_gate, lstm_params.input_gate_bias(), &input_gate, ConvertPolicy::SATURATE));
+        }
         ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(&input_gate, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
     }
     else
@@ -440,9 +541,16 @@
     }
 
     // Validate cell state
-    ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, input_to_cell_weights, cell_bias, &cell_state_tmp));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, input_to_cell_weights, (lstm_params.use_layer_norm()) ? nullptr : cell_bias, &cell_state_tmp));
     ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &cell_state_tmp, 1.f, 0.f, GEMMInfo()));
     ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE));
+    if(lstm_params.use_layer_norm())
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(CLMeanStdDevNormalizationLayer::validate(&cell_state_tmp));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(&cell_state_tmp, lstm_params.cell_layer_norm_weights(), &cell_state_tmp, 1, ConvertPolicy::SATURATE,
+                                                                              RoundingPolicy::TO_NEAREST_EVEN));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&cell_state_tmp, cell_bias, &cell_state_tmp, ConvertPolicy::SATURATE));
+    }
     ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(&cell_state_tmp, nullptr, activation_info));
     ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(&cell_state_tmp, &input_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
     ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(&cell_state_tmp, &forget_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
@@ -460,7 +568,7 @@
     TensorInfo  in_out_gate_concat          = TensorInfo(in_out_weights_concat_shape, 1, input->data_type());
     ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenate2TensorsKernel::validate(input_to_output_weights, recurrent_to_output_weights, &in_out_gate_concat));
     // Validate output gate tmp
-    ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, input_to_output_weights, output_gate_bias, &output_gate_tmp));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, input_to_output_weights, (lstm_params.use_layer_norm()) ? nullptr : output_gate_bias, &output_gate_tmp));
 
     if(lstm_params.has_peephole_opt())
     {
@@ -468,6 +576,13 @@
                                                                               RoundingPolicy::TO_NEAREST_EVEN));
         ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_gate_tmp, &output_gate_tmp, &output_gate_tmp, ConvertPolicy::SATURATE));
     }
+    if(lstm_params.use_layer_norm())
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(CLMeanStdDevNormalizationLayer::validate(&output_gate_tmp));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(&output_gate_tmp, lstm_params.output_layer_norm_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE,
+                                                                              RoundingPolicy::TO_NEAREST_EVEN));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_gate_tmp, output_gate_bias, &output_gate_tmp, ConvertPolicy::SATURATE));
+    }
     ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(&output_gate_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
 
     // Validate output state
@@ -497,7 +612,7 @@
     inputs_vector_info_raw.push_back(&forget_gate);
     inputs_vector_info_raw.push_back(&output_gate_tmp);
 
-    ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenateLayer::validate(inputs_vector_info_raw, scratch_buffer));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLConcatenateLayer::validate(inputs_vector_info_raw, scratch_buffer, Window::DimX));
     return Status{};
 }
 
@@ -514,7 +629,13 @@
     if(_run_peephole_opt)
     {
         CLScheduler::get().enqueue(_pixelwise_mul_forget_gate);
-        _accum_forget_gate2.run();
+        _accum_forget_gate1.run();
+    }
+    if(_is_layer_norm_lstm)
+    {
+        _mean_std_norm_forget_gate.run();
+        CLScheduler::get().enqueue(_pixelwise_mul_forget_gate_coeff);
+        CLScheduler::get().enqueue(_accum_forget_gate_bias);
     }
     CLScheduler::get().enqueue(_activation_forget_gate);
 
@@ -530,7 +651,14 @@
         if(_run_peephole_opt)
         {
             CLScheduler::get().enqueue(_pixelwise_mul_input_gate);
-            _accum_input_gate2.run();
+            _accum_input_gate1.run();
+        }
+
+        if(_is_layer_norm_lstm)
+        {
+            _mean_std_norm_input_gate.run();
+            CLScheduler::get().enqueue(_pixelwise_mul_input_gate_coeff);
+            CLScheduler::get().enqueue(_accum_input_gate_bias);
         }
         CLScheduler::get().enqueue(_activation_input_gate);
     }
@@ -539,6 +667,12 @@
     CLScheduler::get().enqueue(_transpose_cell_state);
     _gemm_cell_state1.run();
     CLScheduler::get().enqueue(_accum_cell_state1);
+    if(_is_layer_norm_lstm)
+    {
+        _mean_std_norm_cell_gate.run();
+        CLScheduler::get().enqueue(_pixelwise_mul_cell_gate_coeff);
+        CLScheduler::get().enqueue(_accum_cell_gate_bias);
+    }
     CLScheduler::get().enqueue(_activation_cell_state);
     CLScheduler::get().enqueue(_pixelwise_mul_cell_state1);
     CLScheduler::get().enqueue(_pixelwise_mul_cell_state2);
@@ -554,7 +688,13 @@
     if(_run_peephole_opt)
     {
         CLScheduler::get().enqueue(_pixelwise_mul_output_state1);
-        _accum_output2.run();
+        _accum_output1.run();
+    }
+    if(_is_layer_norm_lstm)
+    {
+        _mean_std_norm_output_gate.run();
+        CLScheduler::get().enqueue(_pixelwise_mul_output_gate_coeff);
+        CLScheduler::get().enqueue(_accum_output_gate_bias);
     }
     CLScheduler::get().enqueue(_activation_output);
 
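The layer-normalization path added above deliberately passes nullptr as the bias of each fully connected gate projection: the gate is first normalized, then scaled by the per-gate normalization weights, and only then is the bias added, right before the activation. A minimal scalar sketch of that ordering for one gate, assuming a plain float vector and a hypothetical helper name (this illustrates the math only, not the CL kernels configured above):

    #include <cmath>
    #include <cstddef>
    #include <vector>

    // One layer-normalized LSTM gate: z = W*x + R*h (no bias, as in the nullptr case above),
    // then normalize, scale by the layer-norm weights, add the gate bias and activate.
    std::vector<float> layer_norm_gate(std::vector<float> z,
                                       const std::vector<float> &norm_weights,
                                       const std::vector<float> &bias,
                                       float epsilon = 1e-8f)
    {
        const float n = static_cast<float>(z.size());
        float mean = 0.f;
        for(float v : z) { mean += v; }
        mean /= n;
        float var = 0.f;
        for(float v : z) { var += (v - mean) * (v - mean); }
        var /= n;
        const float inv_stddev = 1.f / std::sqrt(var + epsilon);
        for(std::size_t i = 0; i < z.size(); ++i)
        {
            z[i] = (z[i] - mean) * inv_stddev * norm_weights[i] + bias[i];
            z[i] = 1.f / (1.f + std::exp(-z[i])); // logistic activation, as configured above
        }
        return z;
    }
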
diff --git a/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp b/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp
new file mode 100644
index 0000000..11cf85e
--- /dev/null
+++ b/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp
@@ -0,0 +1,556 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/CL/functions/CLLSTMLayerQuantized.h"
+
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+
+#include <cmath>
+#include <memory>
+#include <tuple>
+
+namespace arm_compute
+{
+namespace
+{
+// Quantization info structures used in the quantized LSTM layer
+const QuantizationInfo qasymm(1.f / 128.f, 128);
+const QuantizationInfo qsymm_3(8.f / 32768.f, 0);  // qsymm16 with 3 integer bits
+const QuantizationInfo qsymm_4(16.f / 32768.f, 0); // qsymm16 with 4 integer bits
+const QuantizationInfo qsymm_0(1.f / 32768.f, 0);  // qsymm16 with 0 integer bits
+} // namespace
+
+CLLSTMLayerQuantized::CLLSTMLayerQuantized(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _gemmlowp(), _output_stage(), _transpose_weights(), _concat_input_weights(), _concat_recurrent_weights(), _concat_weights(), _concat_inputs(),
+      _concat_bias(), _sigmoid_forget_gate(), _sigmoid_input_gate(), _sigmoid_output_gate(), _tanh_modulation_gate(), _tanh_output_state(), _add_cell_state_tmps(), _add2(), _mul_forget_gate_cell_state(),
+      _mul_input_gate_input_mod_gate(), _mul_output_state_tmp_output_gate(), _slice_input_tensor(), _slice_forget_tensor(), _slice_cell_tensor(), _slice_output_tensor(), _dequantize(), _quantize(),
+      _input_to_input_weights(nullptr), _input_to_forget_weights(nullptr), _input_to_cell_weights(nullptr), _input_to_output_weights(nullptr), _recurrent_to_input_weights(nullptr),
+      _recurrent_to_forget_weights(nullptr), _recurrent_to_cell_weights(nullptr), _recurrent_to_output_weights(nullptr), _input_gate_bias(nullptr), _forget_gate_bias(nullptr), _cell_bias(nullptr),
+      _output_gate_bias(nullptr), _recurrent_weights(), _input_weights(), _weights(), _input(), _weights_transposed(), _output_highp(), _output_lowp(), _bias(), _forget_gate_input(), _input_gate_input(),
+      _output_gate_input(), _input_modulation_gate_input(), _forget_gate_output(), _input_gate_output(), _output_gate_output(), _input_modulation_gate_output(), _cell_state_tmp1(), _cell_state_tmp2(),
+      _output_state_tmp(), _output_state_out_symm(), _output_state_out_f32(), _is_prepared(false)
+{
+}
+
+void CLLSTMLayerQuantized::configure(const ICLTensor *input,
+                                     const ICLTensor *input_to_input_weights, const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights,
+                                     const ICLTensor *recurrent_to_input_weights, const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights,
+                                     const ICLTensor *input_gate_bias, const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias,
+                                     ICLTensor *cell_state_in, const ICLTensor *output_state_in,
+                                     ICLTensor *cell_state_out, ICLTensor *output_state_out)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
+                                 recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
+                                 input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, output_state_out);
+
+    ARM_COMPUTE_ERROR_THROW_ON(CLLSTMLayerQuantized::validate(input->info(), input_to_input_weights->info(), input_to_forget_weights->info(), input_to_cell_weights->info(),
+                                                              input_to_output_weights->info(),
+                                                              recurrent_to_input_weights->info(), recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(),
+                                                              input_gate_bias->info(), forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info()));
+
+    const int input_size  = input->info()->dimension(0);
+    const int batch_size  = input->info()->dimension(1);
+    const int output_size = input_to_input_weights->info()->dimension(1);
+
+    const QuantizationInfo qweights = input_to_input_weights->info()->quantization_info(); // Weights quantization
+
+    auto_init_if_empty(*cell_state_out->info(), TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QSYMM16, qsymm_4));
+    auto_init_if_empty(*output_state_out->info(), TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QASYMM8, qasymm));
+
+    _input_to_input_weights      = input_to_input_weights;
+    _input_to_forget_weights     = input_to_forget_weights;
+    _input_to_cell_weights       = input_to_cell_weights;
+    _input_to_output_weights     = input_to_output_weights;
+    _recurrent_to_input_weights  = recurrent_to_input_weights;
+    _recurrent_to_forget_weights = recurrent_to_forget_weights;
+    _recurrent_to_cell_weights   = recurrent_to_cell_weights;
+    _recurrent_to_output_weights = recurrent_to_output_weights;
+    _input_gate_bias             = input_gate_bias;
+    _forget_gate_bias            = forget_gate_bias;
+    _cell_bias                   = cell_bias;
+    _output_gate_bias            = output_gate_bias;
+
+    // Weights concatenation
+    std::vector<const ICLTensor *> inputs_weights_vector;
+    inputs_weights_vector.emplace_back(input_to_input_weights);
+    inputs_weights_vector.emplace_back(input_to_forget_weights);
+    inputs_weights_vector.emplace_back(input_to_cell_weights);
+    inputs_weights_vector.emplace_back(input_to_output_weights);
+
+    std::vector<const ICLTensor *> recurrent_weights_vector;
+    recurrent_weights_vector.emplace_back(recurrent_to_input_weights);
+    recurrent_weights_vector.emplace_back(recurrent_to_forget_weights);
+    recurrent_weights_vector.emplace_back(recurrent_to_cell_weights);
+    recurrent_weights_vector.emplace_back(recurrent_to_output_weights);
+
+    _input_weights.allocator()->init(TensorInfo(TensorShape(input_size, 4 * output_size), 1, DataType::QASYMM8, qweights));
+    _concat_input_weights.configure(inputs_weights_vector, &_input_weights, Window::DimY);
+
+    _recurrent_weights.allocator()->init(TensorInfo(TensorShape(output_size, 4 * output_size), 1, DataType::QASYMM8, qweights));
+    _concat_recurrent_weights.configure(recurrent_weights_vector, &_recurrent_weights, Window::DimY);
+
+    std::vector<const ICLTensor *> weights_vector;
+    weights_vector.emplace_back(&_recurrent_weights);
+    weights_vector.emplace_back(&_input_weights);
+
+    _weights.allocator()->init(TensorInfo(TensorShape(output_size + input_size, 4 * output_size), 1, DataType::QASYMM8, qweights));
+    _concat_weights.configure(weights_vector, &_weights, Window::DimX);
+    _transpose_weights.configure(&_weights, &_weights_transposed);
+
+    // Input concatenation
+    std::vector<const ICLTensor *> input_vector;
+    input_vector.emplace_back(input);
+    input_vector.emplace_back(output_state_in);
+
+    _memory_group.manage(&_input);
+    _input.allocator()->init(TensorInfo(TensorShape(output_size + input_size, batch_size), 1, DataType::QASYMM8, qasymm));
+    _concat_inputs.configure(input_vector, &_input, Window::DimX);
+
+    // Bias concatenation
+    std::vector<const ICLTensor *> bias_vector;
+    bias_vector.emplace_back(input_gate_bias);
+    bias_vector.emplace_back(forget_gate_bias);
+    bias_vector.emplace_back(cell_bias);
+    bias_vector.emplace_back(output_gate_bias);
+
+    _bias.allocator()->init(TensorInfo(TensorShape(4 * output_size), 1, DataType::S32));
+    _concat_bias.configure(bias_vector, &_bias, Window::DimX);
+
+    // Invert the offset for gemmlowp
+    _input.info()->set_quantization_info(QuantizationInfo(qasymm.uniform().scale, -qasymm.uniform().offset));
+    _weights_transposed.info()->set_quantization_info(QuantizationInfo(qweights.uniform().scale, -qweights.uniform().offset));
+
+    // Run gemmlowp
+    _memory_group.manage(&_output_highp);
+    _output_highp.allocator()->init(TensorInfo(TensorShape(4 * output_size, batch_size), 1, DataType::S32));
+    _gemmlowp.configure(&_input, &_weights_transposed, nullptr, &_output_highp);
+    _input.allocator()->allocate();
+
+    // Set the offset back
+    _input.info()->set_quantization_info(QuantizationInfo(qasymm.uniform().scale, qasymm.uniform().offset));
+    _weights_transposed.info()->set_quantization_info(QuantizationInfo(qweights.uniform().scale, qweights.uniform().offset));
+
+    // multiplier = (input_scale * weights_scale) / output_scale, where output_scale = 2^-12
+    _output_lowp.allocator()->init(TensorInfo(_output_highp.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_3));
+
+    const float multiplier        = 4096.f * qasymm.uniform().scale * qweights.uniform().scale;
+    int         output_multiplier = 0;
+    int         output_shift      = 0;
+
+    quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
+
+    _memory_group.manage(&_output_lowp);
+    _output_stage.configure(&_output_highp, &_bias, &_output_lowp, output_multiplier, output_shift);
+    _output_highp.allocator()->allocate();
+    _bias.allocator()->allocate();
+
+    // Get the gate tensors
+    if(batch_size > 1)
+    {
+        _memory_group.manage(&_input_gate_input);
+        _slice_input_tensor.configure(&_output_lowp, &_input_gate_input, { 0, 0 }, { output_size, batch_size });
+        _memory_group.manage(&_forget_gate_input);
+        _slice_forget_tensor.configure(&_output_lowp, &_forget_gate_input, { output_size, 0 }, { 2 * output_size, batch_size });
+        _memory_group.manage(&_input_modulation_gate_input);
+        _slice_cell_tensor.configure(&_output_lowp, &_input_modulation_gate_input, { 2 * output_size, 0 }, { 3 * output_size, batch_size });
+        _memory_group.manage(&_output_gate_input);
+        _slice_output_tensor.configure(&_output_lowp, &_output_gate_input, { 3 * output_size, 0 }, { 4 * output_size, batch_size });
+        _output_lowp.allocator()->allocate();
+    }
+    else
+    {
+        _memory_group.manage(&_input_gate_input);
+        _slice_input_tensor.configure(&_output_lowp, &_input_gate_input, { 0 }, { output_size });
+        _memory_group.manage(&_forget_gate_input);
+        _slice_forget_tensor.configure(&_output_lowp, &_forget_gate_input, { output_size }, { 2 * output_size });
+        _memory_group.manage(&_input_modulation_gate_input);
+        _slice_cell_tensor.configure(&_output_lowp, &_input_modulation_gate_input, { 2 * output_size }, { 3 * output_size });
+        _memory_group.manage(&_output_gate_input);
+        _slice_output_tensor.configure(&_output_lowp, &_output_gate_input, { 3 * output_size }, { 4 * output_size });
+        _output_lowp.allocator()->allocate();
+    }
+
+    // Forget gate
+    _memory_group.manage(&_forget_gate_output);
+    _forget_gate_output.allocator()->init(TensorInfo(_forget_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
+    _sigmoid_forget_gate.configure(&_forget_gate_input, &_forget_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+    _forget_gate_input.allocator()->allocate();
+
+    // Input gate
+    _memory_group.manage(&_input_gate_output);
+    _input_gate_output.allocator()->init(TensorInfo(_input_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
+    _sigmoid_input_gate.configure(&_input_gate_input, &_input_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+    _input_gate_input.allocator()->allocate();
+
+    // Input modulation gate equation
+    _memory_group.manage(&_input_modulation_gate_output);
+    _input_modulation_gate_output.allocator()->init(TensorInfo(_input_modulation_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
+    _tanh_modulation_gate.configure(&_input_modulation_gate_input, &_input_modulation_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f));
+    _input_modulation_gate_input.allocator()->allocate();
+
+    // Output gate
+    _memory_group.manage(&_output_gate_output);
+    _output_gate_output.allocator()->init(TensorInfo(_output_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
+    _sigmoid_output_gate.configure(&_output_gate_input, &_output_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+    _output_gate_input.allocator()->allocate();
+
+    // Long term memory
+    _memory_group.manage(&_cell_state_tmp1);
+    _cell_state_tmp1.allocator()->init(TensorInfo(_forget_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4));
+    _mul_forget_gate_cell_state.configure(&_forget_gate_output, cell_state_in, &_cell_state_tmp1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+    _forget_gate_output.allocator()->allocate();
+
+    _memory_group.manage(&_cell_state_tmp2);
+    _cell_state_tmp2.allocator()->init(TensorInfo(_input_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4));
+    _mul_input_gate_input_mod_gate.configure(&_input_gate_output, &_input_modulation_gate_output, &_cell_state_tmp2, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+    _input_modulation_gate_output.allocator()->allocate();
+    _input_gate_output.allocator()->allocate();
+
+    _add_cell_state_tmps.configure(&_cell_state_tmp1, &_cell_state_tmp2, cell_state_out, ConvertPolicy::SATURATE);
+    _cell_state_tmp1.allocator()->allocate();
+    _cell_state_tmp2.allocator()->allocate();
+
+    // Short term memory
+    _memory_group.manage(&_output_state_tmp);
+    _output_state_tmp.allocator()->init(TensorInfo(cell_state_out->info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
+    _tanh_output_state.configure(cell_state_out, &_output_state_tmp, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f));
+
+    _memory_group.manage(&_output_state_out_symm);
+    _output_state_out_symm.allocator()->init(TensorInfo(_output_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
+    _mul_output_state_tmp_output_gate.configure(&_output_state_tmp, &_output_gate_output, &_output_state_out_symm, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+    _output_gate_output.allocator()->allocate();
+    _output_state_tmp.allocator()->allocate();
+
+    // Requantize the output state from QSYMM16 to QASYMM8
+    _memory_group.manage(&_output_state_out_f32);
+    _output_state_out_f32.allocator()->init(TensorInfo(_output_state_out_symm.info()->tensor_shape(), 1, DataType::F32));
+    _dequantize.configure(&_output_state_out_symm, &_output_state_out_f32);
+    _output_state_out_symm.allocator()->allocate();
+
+    _quantize.configure(&_output_state_out_f32, output_state_out);
+    _output_state_out_f32.allocator()->allocate();
+}
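
The single gemmlowp/output-stage result configured above stores all four gate pre-activations side by side along the x dimension, which is why the slice coordinates are multiples of output_size. A small sketch of the same layout on a plain QSYMM16 row (hypothetical helper, not the CL slicing path):

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Per batch row, the output stage produces 4 * output_size QSYMM16 values laid out as
    // [ input gate | forget gate | cell (modulation) gate | output gate ].
    struct GateViews
    {
        const int16_t *input_gate;
        const int16_t *forget_gate;
        const int16_t *cell_gate;
        const int16_t *output_gate;
    };

    inline GateViews slice_gates(const std::vector<int16_t> &row, std::size_t output_size)
    {
        // row.size() is expected to be 4 * output_size for a single batch element.
        return GateViews{ row.data(),
                          row.data() + 1 * output_size,
                          row.data() + 2 * output_size,
                          row.data() + 3 * output_size };
    }
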
+
+Status CLLSTMLayerQuantized::validate(const ITensorInfo *input,
+                                      const ITensorInfo *input_to_input_weights, const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights,
+                                      const ITensorInfo *recurrent_to_input_weights, const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights,
+                                      const ITensorInfo *input_gate_bias, const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias,
+                                      const ITensorInfo *cell_state_in, const ITensorInfo *output_state_in,
+                                      const ITensorInfo *cell_state_out, const ITensorInfo *output_state_out)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_input_weights,
+                                        recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in,
+                                        output_state_in, cell_state_out, output_state_out);
+
+    const int input_size  = input->dimension(0);
+    const int batch_size  = input->dimension(1);
+    const int output_size = input_to_input_weights->dimension(1);
+
+    // Dimensionality checks
+    ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 2);
+    ARM_COMPUTE_RETURN_ERROR_ON(input_to_input_weights->num_dimensions() > 2);
+    ARM_COMPUTE_RETURN_ERROR_ON(input_gate_bias->num_dimensions() > 1);
+    ARM_COMPUTE_RETURN_ERROR_ON(output_state_in->num_dimensions() > 2);
+
+    TensorInfo input_weights_info(input_to_input_weights->clone()->set_tensor_shape(TensorShape(input_size, output_size)).set_data_type(DataType::QASYMM8));
+    TensorInfo recurrent_weights_info(input_to_input_weights->clone()->set_tensor_shape(TensorShape(output_size, output_size)).set_data_type(DataType::QASYMM8));
+    TensorInfo bias_info(input_gate_bias->clone()->set_tensor_shape(TensorShape(output_size)).set_data_type(DataType::S32));
+    TensorInfo output_state_info(cell_state_in->clone()->set_tensor_shape(TensorShape(output_size, batch_size)).set_data_type(DataType::QASYMM8).set_quantization_info(qasymm));
+    TensorInfo cell_state_info(cell_state_in->clone()->set_tensor_shape(TensorShape(output_size, batch_size)).set_data_type(DataType::QSYMM16).set_quantization_info(qsymm_4));
+
+    // Shape checks
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&input_weights_info, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&recurrent_weights_info, recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&cell_state_info, cell_state_in);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&output_state_info, output_state_in);
+
+    // Data type checks
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input_weights_info, input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&recurrent_weights_info, recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&cell_state_info, cell_state_in);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&output_state_info, output_state_in);
+
+    // Quantization checks
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&cell_state_info, cell_state_in);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&output_state_info, output_state_in);
+
+    // Validate internal functions
+    // _concat_input_weights
+    std::vector<const ITensorInfo *> inputs_weights_vector;
+    inputs_weights_vector.emplace_back(input_to_input_weights);
+    inputs_weights_vector.emplace_back(input_to_forget_weights);
+    inputs_weights_vector.emplace_back(input_to_cell_weights);
+    inputs_weights_vector.emplace_back(input_to_output_weights);
+    const QuantizationInfo qweights = input_to_input_weights->quantization_info(); // Weights quantization
+    const TensorInfo       input_weights(TensorShape(input_size, 4 * output_size), 1, DataType::QASYMM8, qweights);
+    ARM_COMPUTE_RETURN_ON_ERROR(CLConcatenateLayer::validate(inputs_weights_vector, &input_weights, Window::DimY));
+
+    // _concat_recurrent_weights
+    std::vector<const ITensorInfo *> recurrent_weights_vector;
+    recurrent_weights_vector.emplace_back(recurrent_to_input_weights);
+    recurrent_weights_vector.emplace_back(recurrent_to_forget_weights);
+    recurrent_weights_vector.emplace_back(recurrent_to_cell_weights);
+    recurrent_weights_vector.emplace_back(recurrent_to_output_weights);
+    const TensorInfo recurrent_weights(TensorShape(output_size, 4 * output_size), 1, DataType::QASYMM8, qweights);
+    ARM_COMPUTE_RETURN_ON_ERROR(CLConcatenateLayer::validate(recurrent_weights_vector, &recurrent_weights, Window::DimY));
+
+    // _concat_weights
+    std::vector<const ITensorInfo *> weights_vector;
+    weights_vector.emplace_back(&recurrent_weights);
+    weights_vector.emplace_back(&input_weights);
+    const TensorInfo weights(TensorShape(input_size + output_size, 4 * output_size), 1, DataType::QASYMM8, qweights);
+    ARM_COMPUTE_RETURN_ON_ERROR(CLConcatenateLayer::validate(weights_vector, &weights, Window::DimX));
+    // _transpose_weights
+    const TensorShape weights_transposed_shape(weights.tensor_shape()[1], weights.tensor_shape()[0]);
+    TensorInfo        weights_transposed = weights.clone()->set_is_resizable(true).set_tensor_shape(weights_transposed_shape);
+    ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(&weights, &weights_transposed));
+
+    // _concat_inputs
+    std::vector<const ITensorInfo *> input_vector;
+    input_vector.emplace_back(input);
+    input_vector.emplace_back(output_state_in);
+    TensorInfo input_concatenated(TensorShape(output_size + input_size, batch_size), 1, DataType::QASYMM8, qasymm);
+    ARM_COMPUTE_RETURN_ON_ERROR(CLConcatenateLayer::validate(input_vector, &input_concatenated, Window::DimX));
+
+    // _concat_bias
+    std::vector<const ITensorInfo *> bias_vector;
+    bias_vector.emplace_back(input_gate_bias);
+    bias_vector.emplace_back(forget_gate_bias);
+    bias_vector.emplace_back(cell_bias);
+    bias_vector.emplace_back(output_gate_bias);
+
+    const TensorInfo bias_concatenated(TensorShape(4 * output_size), 1, DataType::S32);
+    ARM_COMPUTE_RETURN_ON_ERROR(CLConcatenateLayer::validate(bias_vector, &bias_concatenated, Window::DimX));
+
+    // Invert the offset for gemmlowp
+    input_concatenated.set_quantization_info(QuantizationInfo(qasymm.uniform().scale, -qasymm.uniform().offset));
+    weights_transposed.set_quantization_info(QuantizationInfo(qweights.uniform().scale, -qweights.uniform().offset));
+
+    // _gemmlowp
+    const TensorInfo output_highp(TensorShape(4 * output_size, batch_size), 1, DataType::S32);
+    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate(&input_concatenated, &weights_transposed, nullptr, &output_highp));
+
+    // Set the offset back
+    input_concatenated.set_quantization_info(QuantizationInfo(qasymm.uniform().scale, qasymm.uniform().offset));
+    weights_transposed.set_quantization_info(QuantizationInfo(qweights.uniform().scale, qweights.uniform().offset));
+
+    // multiplier = (input_scale * weights_scale) / output_scale, where output_scale = 2^-12
+    const TensorInfo output_lowp(output_highp.tensor_shape(), 1, DataType::QSYMM16, qsymm_3);
+
+    const float multiplier = 4096.f * qasymm.uniform().scale * qweights.uniform().scale;
+    ARM_COMPUTE_UNUSED(multiplier);
+    ARM_COMPUTE_RETURN_ERROR_ON(multiplier > 1.0f);
+    // _output_stage
+    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint::validate(&output_highp, &bias_concatenated, &output_lowp));
+
+    TensorInfo input_gate_input;
+    TensorInfo forget_gate_input;
+    TensorInfo input_modulation_gate_input;
+    TensorInfo output_gate_input;
+
+    if(batch_size > 1)
+    {
+        // _slice_input_tensor
+        input_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3);
+        ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &input_gate_input, { 0, 0 }, { output_size, batch_size }));
+        // _slice_forget_tensor
+        forget_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3);
+        ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &forget_gate_input, { output_size, 0 }, { 2 * output_size, batch_size }));
+        // _slice_cell_tensor
+        input_modulation_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3);
+        ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &input_modulation_gate_input, { 2 * output_size, 0 }, { 3 * output_size, batch_size }));
+        // _slice_output_tensor
+        output_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3);
+        ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &output_gate_input, { 3 * output_size, 0 }, { 4 * output_size, batch_size }));
+    }
+    else
+    {
+        // _slice_input_tensor
+        input_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3);
+        ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &input_gate_input, { 0 }, { output_size }));
+        // _slice_forget_tensor
+        forget_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3);
+        ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &forget_gate_input, { output_size }, { 2 * output_size }));
+        // _slice_cell_tensor
+        input_modulation_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3);
+        ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &input_modulation_gate_input, { 2 * output_size }, { 3 * output_size }));
+        // _slice_output_tensor
+        output_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3);
+        ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &output_gate_input, { 3 * output_size }, { 4 * output_size }));
+    }
+
+    // _sigmoid_forget_gate
+    const TensorInfo forget_gate_output(forget_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0);
+    ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&forget_gate_input, &forget_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+    // _sigmoid_input_gate
+    const TensorInfo input_gate_output(input_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0);
+    ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&input_gate_input, &input_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+    // _tanh_modulation_gate
+    const TensorInfo input_modulation_gate_output(input_modulation_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0);
+    ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&input_modulation_gate_input, &input_modulation_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)));
+    // _sigmoid_output_gate
+    const TensorInfo output_gate_output(output_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0);
+    ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&output_gate_input, &output_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+
+    // _mul_forget_gate_cell_state
+    const TensorInfo cell_state_tmp1(forget_gate_output.tensor_shape(), 1, DataType::QSYMM16, qsymm_4);
+    ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&forget_gate_output, cell_state_in, &cell_state_tmp1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+
+    // _mul_input_gate_input_mod_gate
+    const TensorInfo cell_state_tmp2(input_gate_output.tensor_shape(), 1, DataType::QSYMM16, qsymm_4);
+    ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&input_gate_output, &input_modulation_gate_output, &cell_state_tmp2, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+
+    // _add_cell_state_tmps
+    ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&cell_state_tmp1, &cell_state_tmp2, cell_state_out, ConvertPolicy::SATURATE));
+
+    // _tanh_output_state
+    const TensorInfo output_state_tmp(cell_state_out->tensor_shape(), 1, DataType::QSYMM16, qsymm_0);
+    ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(cell_state_out, &output_state_tmp, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)));
+
+    // _mul_output_state_tmp_output_gate
+    const TensorInfo output_state_out_symm(output_gate_output.tensor_shape(), 1, DataType::QSYMM16, qsymm_0);
+    ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&output_state_tmp, &output_gate_output, &output_state_out_symm, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+
+    // _dequantize
+    const TensorInfo output_state_out_f32(output_state_out_symm.tensor_shape(), 1, DataType::F32);
+    ARM_COMPUTE_RETURN_ON_ERROR(CLDequantizationLayer::validate(&output_state_out_symm, &output_state_out_f32));
+
+    // _quantize
+    ARM_COMPUTE_RETURN_ON_ERROR(CLQuantizationLayer::validate(&output_state_out_f32, output_state_out));
+
+    if(cell_state_out->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&cell_state_info, cell_state_out);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&cell_state_info, cell_state_out);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&cell_state_info, cell_state_out);
+    }
+
+    if(output_state_out->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&output_state_info, output_state_out);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&output_state_info, output_state_out);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&output_state_info, output_state_out);
+    }
+
+    return Status{};
+}
+
+void CLLSTMLayerQuantized::run()
+{
+    prepare();
+
+    // Acquire all the temporaries
+    MemoryGroupResourceScope scope_mg(_memory_group);
+
+    // Concatenate the input with the previous output state
+    _concat_inputs.run();
+
+    // Run gemmlowp
+    _gemmlowp.run();
+    _output_stage.run();
+
+    // Slice the results
+    _slice_input_tensor.run();
+    _slice_forget_tensor.run();
+    _slice_cell_tensor.run();
+    _slice_output_tensor.run();
+
+    // Gates
+    // Forget gate
+    _sigmoid_forget_gate.run();
+
+    // Input gate
+    _sigmoid_input_gate.run();
+
+    // Input modulation gate
+    _tanh_modulation_gate.run();
+
+    // Output gate
+    _sigmoid_output_gate.run();
+
+    // Cell state (long term memory)
+    _mul_forget_gate_cell_state.run();
+    _mul_input_gate_input_mod_gate.run();
+    _add_cell_state_tmps.run();
+
+    // Output state (short term memory)
+    _tanh_output_state.run();
+    _mul_output_state_tmp_output_gate.run();
+
+    // Requantize output state from QSYMM16 to QASYMM8
+    _dequantize.run();
+    _quantize.run();
+}
+
+void CLLSTMLayerQuantized::prepare()
+{
+    if(!_is_prepared)
+    {
+        _input_weights.allocator()->allocate();
+        _concat_input_weights.run();
+
+        _input_to_input_weights->mark_as_unused();
+        _input_to_forget_weights->mark_as_unused();
+        _input_to_cell_weights->mark_as_unused();
+        _input_to_output_weights->mark_as_unused();
+
+        _recurrent_weights.allocator()->allocate();
+        _concat_recurrent_weights.run();
+        _recurrent_to_input_weights->mark_as_unused();
+        _recurrent_to_forget_weights->mark_as_unused();
+        _recurrent_to_cell_weights->mark_as_unused();
+        _recurrent_to_output_weights->mark_as_unused();
+
+        _weights.allocator()->allocate();
+        _concat_weights.run();
+
+        _input_weights.mark_as_unused();
+        _input_weights.allocator()->free();
+        _recurrent_weights.mark_as_unused();
+        _recurrent_weights.allocator()->free();
+
+        _weights_transposed.allocator()->allocate();
+        _transpose_weights.run();
+
+        _weights.mark_as_unused();
+        _weights.allocator()->free();
+
+        _bias.allocator()->allocate();
+        _concat_bias.run();
+        _input_gate_bias->mark_as_unused();
+        _forget_gate_bias->mark_as_unused();
+        _cell_bias->mark_as_unused();
+        _output_gate_bias->mark_as_unused();
+
+        _is_prepared = true;
+    }
+}
+
+} // namespace arm_compute
\ No newline at end of file
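The 4096.f factor in configure() and validate() above comes straight from the qsymm_3 output scale: 8 / 32768 = 2^-12, so dividing by the output scale is the same as multiplying by 4096. A small sketch of the multiplier derivation; the input scale is the qasymm one defined at the top of the file, while the weights scale below is an illustrative value only:

    #include <cassert>
    #include <cmath>

    int main()
    {
        const float input_scale   = 1.f / 128.f;   // qasymm, as defined above
        const float weights_scale = 0.005f;        // illustrative value only
        const float output_scale  = 8.f / 32768.f; // qsymm_3, i.e. 2^-12

        // multiplier = (input_scale * weights_scale) / output_scale
        const float multiplier = (input_scale * weights_scale) / output_scale;
        assert(multiplier < 1.0f);                                            // same constraint as validate()
        assert(std::fabs(multiplier - 4096.f * input_scale * weights_scale) < 1e-6f);
        return 0;
    }
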
diff --git a/src/runtime/CL/functions/CLGEMMTranspose1xW.cpp b/src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp
similarity index 64%
rename from src/runtime/CL/functions/CLGEMMTranspose1xW.cpp
rename to src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp
index d054e01..802a2fc 100644
--- a/src/runtime/CL/functions/CLGEMMTranspose1xW.cpp
+++ b/src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,18 +21,23 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/runtime/CL/functions/CLGEMMTranspose1xW.h"
+#include "arm_compute/runtime/CL/functions/CLMeanStdDevNormalizationLayer.h"
 
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h"
+#include "arm_compute/core/CL/kernels/CLMeanStdDevNormalizationKernel.h"
 #include "arm_compute/core/Types.h"
 #include "support/ToolchainSupport.h"
 
-using namespace arm_compute;
-
-void CLGEMMTranspose1xW::configure(const ICLTensor *input, ICLTensor *output)
+namespace arm_compute
 {
-    auto k = arm_compute::support::cpp14::make_unique<CLGEMMTranspose1xWKernel>();
-    k->configure(input, output);
+void CLMeanStdDevNormalizationLayer::configure(ICLTensor *input, ICLTensor *output, float epsilon)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLMeanStdDevNormalizationKernel>();
+    k->configure(input, output, epsilon);
     _kernel = std::move(k);
-}
\ No newline at end of file
+}
+
+Status CLMeanStdDevNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, float epsilon)
+{
+    return CLMeanStdDevNormalizationKernel::validate(input, output, epsilon);
+}
+} // namespace arm_compute
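A minimal usage sketch of the new function, assuming an OpenCL context is available; the shapes and epsilon below are illustrative and not taken from the change:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLMeanStdDevNormalizationLayer.h"

    using namespace arm_compute;

    int main()
    {
        CLScheduler::get().default_init();

        // 4 rows of 128 F32 values; each row is normalized to zero mean / unit variance.
        CLTensor src, dst;
        src.allocator()->init(TensorInfo(TensorShape(128U, 4U), 1, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(128U, 4U), 1, DataType::F32));

        CLMeanStdDevNormalizationLayer norm;
        norm.configure(&src, &dst, 1e-8f); // epsilon guards against a zero-variance row

        src.allocator()->allocate();
        dst.allocator()->allocate();
        // ... fill src (e.g. via map()/unmap()) ...

        norm.run();
        CLScheduler::get().sync();
        return 0;
    }
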
diff --git a/src/runtime/CL/functions/CLPReluLayer.cpp b/src/runtime/CL/functions/CLPReluLayer.cpp
new file mode 100644
index 0000000..d463ef9
--- /dev/null
+++ b/src/runtime/CL/functions/CLPReluLayer.cpp
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/runtime/CL/functions/CLPReluLayer.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+namespace
+{
+void configure_border_handler(CLFillBorderKernel &border_handler, BorderSize border_size, ICLTensor *input1, ICLTensor *input2, const ICLTensor *output)
+{
+    if(output->info()->dimension(0) > 1)
+    {
+        ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
+
+        if(broadcasted_info->info()->dimension(0) == 1)
+        {
+            border_handler.configure(broadcasted_info, border_size, BorderMode::REPLICATE);
+        }
+    }
+}
+} // namespace
+
+void CLPReluLayer::configure(ICLTensor *input, ICLTensor *alpha, ICLTensor *output)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLArithmeticOperationKernel>();
+    k->configure(ArithmeticOperation::PRELU, input, alpha, output);
+    _kernel = std::move(k);
+    configure_border_handler(_border_handler, _kernel->border_size(), input, alpha, output);
+}
+
+Status CLPReluLayer::validate(const ITensorInfo *input, const ITensorInfo *alpha, const ITensorInfo *output)
+{
+    return CLArithmeticOperationKernel::validate(ArithmeticOperation::PRELU, input, alpha, output);
+}
+} // namespace arm_compute
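CLPReluLayer routes PReLU through the element-wise arithmetic kernel (ArithmeticOperation::PRELU), broadcasting alpha when needed via the border handler above. For reference, the per-element semantics are:

    // PReLU: pass positive values through, scale negative values by the learned slope.
    inline float prelu(float x, float alpha)
    {
        return x > 0.f ? x : alpha * x;
    }
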
diff --git a/src/runtime/CL/functions/CLPoolingLayer.cpp b/src/runtime/CL/functions/CLPoolingLayer.cpp
index cbe1ce3..086017a 100644
--- a/src/runtime/CL/functions/CLPoolingLayer.cpp
+++ b/src/runtime/CL/functions/CLPoolingLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -45,7 +45,7 @@
     PixelValue pixel_value(0.f);
     if(is_data_type_quantized_asymmetric(input->info()->data_type()) && !pool_info.exclude_padding())
     {
-        pixel_value = PixelValue(static_cast<uint32_t>(input->info()->quantization_info().offset));
+        pixel_value = PixelValue(static_cast<uint32_t>(input->info()->quantization_info().uniform().offset));
     }
     switch(input->info()->data_layout())
     {
diff --git a/src/runtime/CL/functions/CLReductionOperation.cpp b/src/runtime/CL/functions/CLReductionOperation.cpp
index 9f99d2d..38f0a75 100644
--- a/src/runtime/CL/functions/CLReductionOperation.cpp
+++ b/src/runtime/CL/functions/CLReductionOperation.cpp
@@ -105,6 +105,16 @@
                 intermediate_kernel_op = ReductionOperation::PROD;
                 last_kernel_op         = ReductionOperation::PROD;
                 break;
+            case ReductionOperation::MIN:
+                first_kernel_op        = ReductionOperation::MIN;
+                intermediate_kernel_op = ReductionOperation::MIN;
+                last_kernel_op         = ReductionOperation::MIN;
+                break;
+            case ReductionOperation::MAX:
+                first_kernel_op        = ReductionOperation::MAX;
+                intermediate_kernel_op = ReductionOperation::MAX;
+                last_kernel_op         = ReductionOperation::MAX;
+                break;
             default:
                 ARM_COMPUTE_ERROR("Not supported");
         }
@@ -179,6 +189,60 @@
                 last_kernel_op         = ReductionOperation::PROD;
                 pixelValue             = PixelValue(1, input->info()->data_type());
                 break;
+            case ReductionOperation::MIN:
+                first_kernel_op        = ReductionOperation::MIN;
+                intermediate_kernel_op = ReductionOperation::MIN;
+                last_kernel_op         = ReductionOperation::MIN;
+                switch(input->info()->data_type())
+                {
+                    case DataType::F32:
+                    {
+                        pixelValue = PixelValue(std::numeric_limits<float>::max());
+                        break;
+                    }
+                    case DataType::F16:
+                    {
+                        pixelValue = PixelValue(static_cast<half>(65504.0f));
+                        break;
+                    }
+                    case DataType::QASYMM8:
+                    {
+                        pixelValue = PixelValue(255, input->info()->data_type(), input->info()->quantization_info());
+                        break;
+                    }
+                    default:
+                    {
+                        ARM_COMPUTE_ERROR("Unsupported DataType");
+                    }
+                }
+                break;
+            case ReductionOperation::MAX:
+                first_kernel_op        = ReductionOperation::MAX;
+                intermediate_kernel_op = ReductionOperation::MAX;
+                last_kernel_op         = ReductionOperation::MAX;
+                switch(input->info()->data_type())
+                {
+                    case DataType::F32:
+                    {
+                        pixelValue = PixelValue(-std::numeric_limits<float>::max());
+                        break;
+                    }
+                    case DataType::F16:
+                    {
+                        pixelValue = PixelValue(static_cast<half>(-65504.0f));
+                        break;
+                    }
+                    case DataType::QASYMM8:
+                    {
+                        pixelValue = PixelValue(0, input->info()->data_type(), input->info()->quantization_info());
+                        break;
+                    }
+                    default:
+                    {
+                        ARM_COMPUTE_ERROR("Unsupported DataType");
+                    }
+                }
+                break;
             default:
                 ARM_COMPUTE_ERROR("Not supported");
         }
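The MIN and MAX cases above pick the fill value as the identity element of the comparison, so border elements can never influence the result: the largest representable value for MIN (FLT_MAX, 65504 for F16, 255 for QASYMM8) and the smallest for MAX. A sketch of the same idea for plain floats (illustrative only):

    #include <limits>

    // Seeds such that any real element replaces them on the first comparison.
    constexpr float min_reduction_seed = std::numeric_limits<float>::max();    // for MIN
    constexpr float max_reduction_seed = std::numeric_limits<float>::lowest(); // for MAX (== -FLT_MAX)

    inline float reduce_min(float acc, float v) { return v < acc ? v : acc; }
    inline float reduce_max(float acc, float v) { return v > acc ? v : acc; }
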
diff --git a/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp b/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp
index a24b72e..fa6e82e 100644
--- a/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp
+++ b/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -44,7 +44,7 @@
     if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size())
     {
         _has_padding = true;
-        _memset_kernel.configure(output, PixelValue());
+        _memset_kernel.configure(output, PixelValue(0, input->info()->data_type(), input->info()->quantization_info()));
     }
     _space_to_batch_kernel.configure(input, block_shape, paddings, output);
 }
@@ -56,14 +56,14 @@
     if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size())
     {
         _has_padding = true;
-        _memset_kernel.configure(output, PixelValue());
+        _memset_kernel.configure(output, PixelValue(0, input->info()->data_type(), input->info()->quantization_info()));
     }
     _space_to_batch_kernel.configure(input, block_shape_x, block_shape_y, padding_left, padding_right, output);
 }
 
 Status CLSpaceToBatchLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(CLMemsetKernel::validate(output, PixelValue()));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLMemsetKernel::validate(output, PixelValue(0, input->data_type(), input->quantization_info())));
     ARM_COMPUTE_RETURN_ON_ERROR(CLSpaceToBatchLayerKernel::validate(input, block_shape, paddings, output));
 
     return Status{};
@@ -72,7 +72,7 @@
 Status CLSpaceToBatchLayer::validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right,
                                      const ITensorInfo *output)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(CLMemsetKernel::validate(output, PixelValue()));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLMemsetKernel::validate(output, PixelValue(0, input->data_type(), input->quantization_info())));
     ARM_COMPUTE_RETURN_ON_ERROR(CLSpaceToBatchLayerKernel::validate(input, block_shape_x, block_shape_y, padding_left, padding_right, output));
 
     return Status{};
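The memset changes above matter for quantized tensors: with asymmetric quantization the integer value that represents real 0.0 is the zero point (offset), not 0, so the padded region has to be filled with it. A small sketch of why, using an illustrative scale/offset pair:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Asymmetric 8-bit convention: real = scale * (q - offset), so quantize(0.0f) == offset.
    inline uint8_t quantize_qasymm8(float real, float scale, int32_t offset)
    {
        const int32_t q = static_cast<int32_t>(std::lround(real / scale)) + offset;
        return static_cast<uint8_t>(std::min(255, std::max(0, q)));
    }

    // Example: scale = 0.1f, offset = 128 -> padding must be written as 128, not 0.
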
diff --git a/src/runtime/CL/functions/CLSpaceToDepthLayer.cpp b/src/runtime/CL/functions/CLSpaceToDepthLayer.cpp
new file mode 100644
index 0000000..f02a13f
--- /dev/null
+++ b/src/runtime/CL/functions/CLSpaceToDepthLayer.cpp
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/CL/functions/CLSpaceToDepthLayer.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+namespace arm_compute
+{
+CLSpaceToDepthLayer::CLSpaceToDepthLayer()
+    : _space_to_depth_kernel()
+{
+}
+
+void CLSpaceToDepthLayer::configure(const ICLTensor *input, ICLTensor *output, int32_t block_shape)
+{
+    _space_to_depth_kernel.configure(input, output, block_shape);
+}
+
+Status CLSpaceToDepthLayer::validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape)
+{
+    return CLSpaceToDepthLayerKernel::validate(input, output, block_shape);
+}
+
+void CLSpaceToDepthLayer::run()
+{
+    CLScheduler::get().enqueue(_space_to_depth_kernel, true);
+}
+} // namespace arm_compute
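
The new function is a thin wrapper: configure() forwards to CLSpaceToDepthLayerKernel and run() enqueues it. A hedged usage sketch follows; the shape, block size, and data type are illustrative, and with a 4x4x2 NCHW input and block_shape = 2 the output shape would be 2x2x8.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLSpaceToDepthLayer.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init(); // set up the OpenCL context and queue

    CLTensor input, output;
    input.allocator()->init(TensorInfo(TensorShape(4U, 4U, 2U), 1, DataType::F32));
    output.allocator()->init(TensorInfo(TensorShape(2U, 2U, 8U), 1, DataType::F32));

    CLSpaceToDepthLayer s2d;
    s2d.configure(&input, &output, 2 /* block_shape */);

    input.allocator()->allocate();
    output.allocator()->allocate();
    // ... map the input, fill it, unmap, then:
    s2d.run();
    return 0;
}
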
diff --git a/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp b/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp
deleted file mode 100644
index a8667c3..0000000
--- a/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp
+++ /dev/null
@@ -1,143 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLWidthConcatenateLayer.h"
-
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "support/ToolchainSupport.h"
-
-using namespace arm_compute;
-
-CLWidthConcatenateLayer::CLWidthConcatenateLayer() // NOLINT
-    : _concat_kernels_vector(),
-      _concat_x2_kernel(),
-      _concat_x4_kernel(),
-      _num_inputs(0)
-{
-}
-
-Status CLWidthConcatenateLayer::validate(const std::vector<ITensorInfo *> &inputs_vector, const ITensorInfo *output) // NOLINT
-{
-    const unsigned int num_inputs = inputs_vector.size();
-
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
-    ARM_COMPUTE_RETURN_ERROR_ON(num_inputs < 2);
-
-    // Output auto initialization if not yet initialized
-    TensorInfo        tmp_output_info = *output->clone();
-    const TensorShape output_shape    = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, Window::DimX);
-    auto_init_if_empty(tmp_output_info, output_shape, 1, inputs_vector[0]->data_type());
-
-    switch(num_inputs)
-    {
-        case 2:
-            // Validate WidthConcatenate2Tensors kernels if there are 2 inputs
-            ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(inputs_vector[0], inputs_vector[1]);
-            ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenate2TensorsKernel::validate(inputs_vector[0], inputs_vector[1], &tmp_output_info));
-            break;
-        case 4:
-            // Validate WidthConcatenate4Tensors kernels if there are 4 inputs
-            ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(inputs_vector[0], inputs_vector[1], inputs_vector[2], inputs_vector[3]);
-            ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenate4TensorsKernel::validate(inputs_vector[0], inputs_vector[1], inputs_vector[2], inputs_vector[3], &tmp_output_info));
-            break;
-        default:
-            unsigned int width_offset = 0;
-            // Validate generic case of WidthConcatenate kernel
-            for(const auto &input : inputs_vector)
-            {
-                ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
-                ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenateLayerKernel::validate(input, width_offset, &tmp_output_info));
-                width_offset += input->dimension(0);
-            }
-            break;
-    }
-
-    return Status{};
-}
-
-void CLWidthConcatenateLayer::configure(std::vector<ICLTensor *> inputs_vector, ICLTensor *output) // NOLINT
-{
-    _num_inputs = inputs_vector.size();
-
-    std::vector<ITensorInfo *> inputs_vector_info;
-    for(unsigned int i = 0; i < _num_inputs; i++)
-    {
-        inputs_vector_info.emplace_back(inputs_vector.at(i)->info());
-    }
-    const TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, Window::DimX);
-
-    // Output auto initialization if not yet initialized
-    auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type());
-
-    ARM_COMPUTE_ERROR_THROW_ON(CLWidthConcatenateLayer::validate(inputs_vector_info, output->info()));
-
-    switch(_num_inputs)
-    {
-        case 2:
-            // Configure WidthConcatenate2Tensors kernel
-            _concat_x2_kernel.configure(inputs_vector.at(0), inputs_vector.at(1), output);
-            break;
-        case 4:
-            // Configure WidthConcatenate4Tensors kernel
-            _concat_x4_kernel.configure(inputs_vector.at(0), inputs_vector.at(1), inputs_vector.at(2), inputs_vector.at(3), output);
-            break;
-        default:
-            // Configure generic case WidthConcatenate kernels
-            _concat_kernels_vector.resize(_num_inputs);
-
-            unsigned int width_offset = 0;
-            for(unsigned int i = 0; i < _num_inputs; ++i)
-            {
-                _concat_kernels_vector[i].configure(inputs_vector.at(i), width_offset, output);
-                width_offset += inputs_vector.at(i)->info()->dimension(0);
-            }
-            break;
-    }
-}
-
-void CLWidthConcatenateLayer::run()
-{
-    cl::CommandQueue q = CLScheduler::get().queue();
-
-    switch(_num_inputs)
-    {
-        case 2:
-            CLScheduler::get().enqueue(_concat_x2_kernel, true);
-            break;
-        case 4:
-            CLScheduler::get().enqueue(_concat_x4_kernel, true);
-            break;
-        default:
-            for(unsigned int i = 0; i < _num_inputs; ++i)
-            {
-                CLScheduler::get().enqueue(_concat_kernels_vector[i], true);
-            }
-            break;
-    }
-}
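
With this file deleted, the dedicated width-concatenation function no longer has a runtime implementation; width-wise joins are expected to go through the generic CLConcatenateLayer along the width axis (axis 0) instead. A hedged migration sketch follows; tensor setup and allocation are omitted, and the helper name is hypothetical.

#include <vector>

#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLConcatenateLayer.h"

using namespace arm_compute;

// Hypothetical migration helper: tensors previously joined with the removed
// width-concatenation function are concatenated along axis 0 (DimX / width).
void concat_along_width(CLTensor &a, CLTensor &b, CLTensor &c, CLTensor &out)
{
    std::vector<ICLTensor *> inputs{ &a, &b, &c };
    CLConcatenateLayer       concat;
    concat.configure(inputs, &out, 0U /* width axis */);
    concat.run();
}
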
diff --git a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
index d3c3f98..a5db977 100644
--- a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
@@ -158,6 +158,9 @@
     const Size2D kernel_size = Size2D(weights->tensor_shape()[idx_width], weights->tensor_shape()[idx_height]);
     const Size2D output_tile = winograd_output_tile(input_dims, kernel_size, input->data_layout());
 
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((conv_info.pad_left() > (kernel_size.x() / 2u)) || (conv_info.pad_right() > (kernel_size.x() / 2u))), "Winograd only supports padding up to half kernel size");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((conv_info.pad_top() > (kernel_size.y() / 2u)) || (conv_info.pad_bottom() > (kernel_size.y() / 2u))), "Winograd only supports padding up to half kernel size");
+
     // Check if the Winograd configuration requires fast math
     if(!enable_fast_math)
     {
@@ -189,13 +192,7 @@
                                                                                                                      GEMMLowpOutputStageInfo(), (input->data_type() == DataType::F16))));
 
     // Configure output transform
-    ARM_COMPUTE_RETURN_ON_ERROR(CLWinogradOutputTransformKernel::validate(&batched_mm_output, biases, output, winograd_info));
-
-    // Validate Activation Layer
-    if(act_info.enabled())
-    {
-        ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output, nullptr, act_info));
-    }
+    ARM_COMPUTE_RETURN_ON_ERROR(CLWinogradOutputTransformKernel::validate(&batched_mm_output, biases, output, winograd_info, act_info));
 
     return Status{};
 }
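
The added checks reject Winograd configurations whose padding exceeds half the kernel size in either dimension, and the activation is now validated as part of the output-transform kernel instead of a separate CLActivationLayer pass. A small pre-check sketch of the padding rule follows; the kernel size and padding values are illustrative.

#include "arm_compute/core/Size2D.h"
#include "arm_compute/core/Types.h"

using namespace arm_compute;

// For a 3x3 kernel, at most 1 pixel of padding per side is accepted by Winograd.
const Size2D        kernel_size(3U, 3U);
const PadStrideInfo conv_info(1, 1, 1, 1); // stride 1x1, pad 1x1 -> accepted
const bool pad_ok = conv_info.pad_left() <= kernel_size.x() / 2
                    && conv_info.pad_right() <= kernel_size.x() / 2
                    && conv_info.pad_top() <= kernel_size.y() / 2
                    && conv_info.pad_bottom() <= kernel_size.y() / 2;
// pad_ok is true here; padding of 2 on any side with a 3x3 kernel would fail
// the new validate() checks.
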