arm_compute v19.02

Change-Id: I853a3ecf38f206da13c1b03640c8adf73c20477c
diff --git a/src/runtime/CL/functions/CLArgMinMaxLayer.cpp b/src/runtime/CL/functions/CLArgMinMaxLayer.cpp
new file mode 100644
index 0000000..a6393c5
--- /dev/null
+++ b/src/runtime/CL/functions/CLArgMinMaxLayer.cpp
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/CL/functions/CLArgMinMaxLayer.h"
+
+#include "arm_compute/core/CL/kernels/CLReductionOperationKernel.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+namespace arm_compute
+{
+void CLArgMinMaxLayer::configure(const ICLTensor *input, int axis, ICLTensor *output, const ReductionOperation &op)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLReductionOperationKernel>();
+    k->configure(input, output, axis, op);
+    _kernel = std::move(k);
+}
+
+Status CLArgMinMaxLayer::validate(const ITensorInfo *input, int axis, const ITensorInfo *output, const ReductionOperation &op)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN, "Invalid operation");
+    return CLReductionOperationKernel::validate(input, output, axis, op);
+}
+} // namespace arm_compute
\ No newline at end of file
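
The new function follows the ICLSimpleFunction pattern, so configure() simply creates and owns a single CLReductionOperationKernel. A minimal usage sketch, assuming illustrative shapes, a U32 index output, and that the reduced axis is kept in the output shape with size 1 (none of which is stated in the patch itself):

    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLArgMinMaxLayer.h"

    using namespace arm_compute;

    int main()
    {
        CLScheduler::get().default_init();

        // 32 values per row, 16 rows; find the index of the maximum along axis 0.
        CLTensor input, output;
        input.allocator()->init(TensorInfo(TensorShape(32U, 16U), 1, DataType::F32));
        output.allocator()->init(TensorInfo(TensorShape(1U, 16U), 1, DataType::U32)); // assumed index type

        CLArgMinMaxLayer argmax;
        argmax.configure(&input, 0, &output, ReductionOperation::ARG_IDX_MAX);

        input.allocator()->allocate();
        output.allocator()->allocate();
        // ... map input, fill it, unmap ...
        argmax.run();
        return 0;
    }
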
diff --git a/src/runtime/CL/functions/CLArithmeticAddition.cpp b/src/runtime/CL/functions/CLArithmeticAddition.cpp
deleted file mode 100644
index 0b05058..0000000
--- a/src/runtime/CL/functions/CLArithmeticAddition.cpp
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLArithmeticAddition.h"
-
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/kernels/CLArithmeticAdditionKernel.h"
-#include "support/ToolchainSupport.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-void CLArithmeticAddition::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy)
-{
-    auto k = arm_compute::support::cpp14::make_unique<CLArithmeticAdditionKernel>();
-    k->configure(input1, input2, output, policy);
-    _kernel = std::move(k);
-
-    if(output->info()->dimension(0) > 1)
-    {
-        ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
-
-        if(broadcasted_info->info()->dimension(0) == 1)
-        {
-            _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
-        }
-    }
-}
-
-Status CLArithmeticAddition::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
-{
-    return CLArithmeticAdditionKernel::validate(input1, input2, output, policy);
-}
diff --git a/src/runtime/CL/functions/CLArithmeticDivision.cpp b/src/runtime/CL/functions/CLArithmeticDivision.cpp
deleted file mode 100644
index 1c2849c..0000000
--- a/src/runtime/CL/functions/CLArithmeticDivision.cpp
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Copyright (c) 2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLArithmeticDivision.h"
-
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/kernels/CLArithmeticDivisionKernel.h"
-#include "support/ToolchainSupport.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-void CLArithmeticDivision::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output)
-{
-    auto k = arm_compute::support::cpp14::make_unique<CLArithmeticDivisionKernel>();
-    k->configure(input1, input2, output);
-    _kernel = std::move(k);
-
-    if(output->info()->dimension(0) > 1)
-    {
-        ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
-
-        if(broadcasted_info->info()->dimension(0) == 1)
-        {
-            _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
-        }
-    }
-}
-
-Status CLArithmeticDivision::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
-{
-    return CLArithmeticDivisionKernel::validate(input1, input2, output);
-}
diff --git a/src/runtime/CL/functions/CLArithmeticSubtraction.cpp b/src/runtime/CL/functions/CLArithmeticSubtraction.cpp
deleted file mode 100644
index e661f6a..0000000
--- a/src/runtime/CL/functions/CLArithmeticSubtraction.cpp
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLArithmeticSubtraction.h"
-
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/kernels/CLArithmeticSubtractionKernel.h"
-#include "support/ToolchainSupport.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-void CLArithmeticSubtraction::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy)
-{
-    auto k = arm_compute::support::cpp14::make_unique<CLArithmeticSubtractionKernel>();
-    k->configure(input1, input2, output, policy);
-    _kernel = std::move(k);
-
-    if(output->info()->dimension(0) > 1)
-    {
-        ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
-
-        if(broadcasted_info->info()->dimension(0) == 1)
-        {
-            _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
-        }
-    }
-}
-
-Status CLArithmeticSubtraction::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
-{
-    return CLArithmeticSubtractionKernel::validate(input1, input2, output, policy);
-}
diff --git a/src/runtime/CL/functions/CLCast.cpp b/src/runtime/CL/functions/CLCast.cpp
new file mode 100644
index 0000000..e0ffcdb
--- /dev/null
+++ b/src/runtime/CL/functions/CLCast.cpp
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLCast.h"
+
+#include "arm_compute/core/CL/kernels/CLDepthConvertLayerKernel.h"
+#include "support/ToolchainSupport.h"
+
+#include <utility>
+
+namespace arm_compute
+{
+void CLCast::configure(const ICLTensor *input, ICLTensor *output, ConvertPolicy policy)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLDepthConvertLayerKernel>();
+    k->configure(input, output, policy, 0);
+    _kernel = std::move(k);
+}
+
+Status CLCast::validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy)
+{
+    return CLDepthConvertLayerKernel::validate(input, output, policy, 0);
+}
+} // namespace arm_compute
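
CLCast reuses CLDepthConvertLayerKernel with a fixed shift of 0, so it only changes the element data type. A minimal sketch, assuming U8 to F32 is among the supported conversion pairs:

    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLCast.h"

    using namespace arm_compute;

    void cast_example()
    {
        CLTensor src, dst;
        src.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::U8));
        dst.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::F32));

        CLCast cast;
        cast.configure(&src, &dst, ConvertPolicy::SATURATE);

        src.allocator()->allocate();
        dst.allocator()->allocate();
        cast.run();
    }
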
diff --git a/src/runtime/CL/functions/CLComparison.cpp b/src/runtime/CL/functions/CLComparison.cpp
new file mode 100644
index 0000000..86c9c31
--- /dev/null
+++ b/src/runtime/CL/functions/CLComparison.cpp
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLComparison.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLComparisonKernel.h"
+#include "arm_compute/core/Types.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+void CLComparison::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ComparisonOperation operation)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLComparisonKernel>();
+    k->configure(input1, input2, output, operation);
+    _kernel = std::move(k);
+
+    if(output->info()->dimension(0) > 1)
+    {
+        ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
+
+        if(broadcasted_info->info()->dimension(0) == 1)
+        {
+            _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
+        }
+    }
+}
+
+Status CLComparison::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ComparisonOperation operation)
+{
+    return CLComparisonKernel::validate(input1, input2, output, operation);
+}
+
+template <ComparisonOperation COP>
+void CLComparisonStatic<COP>::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLComparisonKernel>();
+    k->configure(input1, input2, output, COP);
+    _kernel = std::move(k);
+
+    if(output->info()->dimension(0) > 1)
+    {
+        ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
+
+        if(broadcasted_info->info()->dimension(0) == 1)
+        {
+            _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
+        }
+    }
+}
+
+template <ComparisonOperation COP>
+Status CLComparisonStatic<COP>::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+    return CLComparisonKernel::validate(input1, input2, output, COP);
+}
+
+// Supported Specializations
+template class CLComparisonStatic<ComparisonOperation::Equal>;
+template class CLComparisonStatic<ComparisonOperation::NotEqual>;
+template class CLComparisonStatic<ComparisonOperation::Greater>;
+template class CLComparisonStatic<ComparisonOperation::GreaterEqual>;
+template class CLComparisonStatic<ComparisonOperation::Less>;
+template class CLComparisonStatic<ComparisonOperation::LessEqual>;
+} // namespace arm_compute
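
CLComparison picks the operation at runtime, while the CLComparisonStatic instantiations bake it in at compile time. A sketch of the runtime variant, assuming the usual U8 output convention (non-zero where the predicate holds):

    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLComparison.h"

    using namespace arm_compute;

    void compare_example()
    {
        CLTensor a, b, result;
        a.allocator()->init(TensorInfo(TensorShape(128U), 1, DataType::F32));
        b.allocator()->init(TensorInfo(TensorShape(128U), 1, DataType::F32));
        result.allocator()->init(TensorInfo(TensorShape(128U), 1, DataType::U8));

        CLComparison greater;
        greater.configure(&a, &b, &result, ComparisonOperation::Greater);

        a.allocator()->allocate();
        b.allocator()->allocate();
        result.allocator()->allocate();
        greater.run();
    }
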
diff --git a/src/runtime/CL/functions/CLComputeAllAnchors.cpp b/src/runtime/CL/functions/CLComputeAllAnchors.cpp
index 409d3c9..24c152f 100644
--- a/src/runtime/CL/functions/CLComputeAllAnchors.cpp
+++ b/src/runtime/CL/functions/CLComputeAllAnchors.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
index e07feb2..9da02c1 100644
--- a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -158,6 +158,18 @@
     _scaled_output.allocator()->allocate();
 }
 
+void CLDeconvolutionLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &info,
+                                     const WeightsInfo &weights_info)
+{
+    configure(input, weights, bias, output, info, 0, 0, weights_info);
+}
+
+Status CLDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &info,
+                                      const WeightsInfo &weights_info)
+{
+    return CLDeconvolutionLayer::validate(input, weights, bias, output, info, 0, 0, weights_info);
+}
+
 void CLDeconvolutionLayer::run()
 {
     prepare();
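
The two added overloads forward to the existing ones with a zero inner border, so callers that do not use inner_border_right/inner_border_top can drop those arguments. A sketch of the shorter call, with illustrative stride and padding:

    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLDeconvolutionLayer.h"

    using namespace arm_compute;

    void configure_deconv(CLTensor &input, CLTensor &weights, CLTensor &bias, CLTensor &output)
    {
        CLDeconvolutionLayer deconv;
        // Equivalent to configure(&input, &weights, &bias, &output, info, 0, 0, WeightsInfo())
        deconv.configure(&input, &weights, &bias, &output, PadStrideInfo(1, 1, 0, 0), WeightsInfo());
    }
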
diff --git a/src/runtime/CL/functions/CLDepthConcatenateLayer.cpp b/src/runtime/CL/functions/CLDepthConcatenateLayer.cpp
index b5e8fd9..e46647a 100644
--- a/src/runtime/CL/functions/CLDepthConcatenateLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthConcatenateLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -66,7 +66,7 @@
     for(unsigned int i = 0; i < _num_inputs; i++)
     {
         _concat_kernels_vector[i].configure(inputs_vector.at(i), depth_offset, output);
-        _border_handlers_vector[i].configure(inputs_vector.at(i), _concat_kernels_vector[i].border_size(), BorderMode::CONSTANT, PixelValue(0));
+        _border_handlers_vector[i].configure(inputs_vector.at(i), _concat_kernels_vector[i].border_size(), BorderMode::CONSTANT, PixelValue());
 
         depth_offset += inputs_vector.at(i)->info()->dimension(2);
     }
diff --git a/src/runtime/CL/functions/CLDepthConvertLayer.cpp b/src/runtime/CL/functions/CLDepthConvertLayer.cpp
index 2e52e8a..dbf71ac 100644
--- a/src/runtime/CL/functions/CLDepthConvertLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthConvertLayer.cpp
@@ -28,8 +28,8 @@
 
 #include <utility>
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 void CLDepthConvertLayer::configure(const ICLTensor *input, ICLTensor *output, ConvertPolicy policy, uint32_t shift)
 {
     auto k = arm_compute::support::cpp14::make_unique<CLDepthConvertLayerKernel>();
@@ -41,3 +41,4 @@
 {
     return CLDepthConvertLayerKernel::validate(input, output, policy, shift);
 }
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
index 497cdae..15cbfce 100644
--- a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,18 +26,21 @@
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h"
 #include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h"
+#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/PixelValue.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 #include "support/ToolchainSupport.h"
 
-using namespace arm_compute;
+namespace arm_compute
+{
 using namespace arm_compute::misc;
 using namespace arm_compute::misc::shape_calculator;
 
-CLDepthwiseConvolutionLayer3x3::CLDepthwiseConvolutionLayer3x3()
-    : _kernel(nullptr), _border_handler()
+CLDepthwiseConvolutionLayer3x3::CLDepthwiseConvolutionLayer3x3(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _kernel(nullptr), _border_handler(), _permute_input_to_nchw(), _permute_weights_to_nchw(), _permute_output_to_nhwc(), _reshape_weights(), _permuted_input(),
+      _permuted_weights(), _permuted_output(), _original_weights(nullptr), _needs_permute(false), _needs_weights_reshape(false), _is_prepared(false)
 {
 }
 
@@ -47,25 +50,79 @@
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
 
-    if(input->info()->data_layout() == DataLayout::NCHW)
+    const bool is_nhwc = input->info()->data_layout() == DataLayout::NHWC;
+
+    _needs_permute         = is_nhwc && (depth_multiplier > 1);
+    _needs_weights_reshape = is_nhwc && (depth_multiplier == 1)
+                             && is_data_type_quantized_asymmetric(input->info()->data_type());
+    _is_prepared      = false;
+    _original_weights = weights;
+
+    ICLTensor       *input_to_use   = input;
+    const ICLTensor *weights_to_use = weights;
+    ICLTensor       *output_to_use  = output;
+
+    const bool                      is_stride_1       = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1));
+    const bool                      is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device());
+    DepthwiseConvolutionReshapeInfo info;
+    info.c0        = 4;
+    info.transpose = is_stride_1 && is_dot8_supported;
+
+    if(_needs_permute)
     {
+        _memory_group.manage(&_permuted_input);
+        _memory_group.manage(&_permuted_output);
+
+        // Configure the function to transform the input tensor from NHWC -> NCHW
+        _permute_input_to_nchw.configure(input, &_permuted_input, PermutationVector(1U, 2U, 0U));
+        _permuted_input.info()->set_data_layout(DataLayout::NCHW);
+
+        // Configure the function to transform the weights tensor from HWI -> IHW
+        _permute_weights_to_nchw.configure(weights, &_permuted_weights, PermutationVector(1U, 2U, 0U));
+        _permuted_weights.info()->set_data_layout(DataLayout::NCHW);
+
+        input_to_use   = &_permuted_input;
+        weights_to_use = &_permuted_weights;
+        output_to_use  = &_permuted_output;
+
         _kernel = arm_compute::support::cpp14::make_unique<CLDepthwiseConvolutionLayer3x3NCHWKernel>();
     }
+    else if(is_nhwc)
+    {
+        if(_needs_weights_reshape)
+        {
+            _reshape_weights.configure(weights, &_permuted_weights, info);
+            weights_to_use = &_permuted_weights;
+        }
+        _kernel = arm_compute::support::cpp14::make_unique<CLDepthwiseConvolutionLayer3x3NHWCKernel>();
+    }
     else
     {
-        _kernel = arm_compute::support::cpp14::make_unique<CLDepthwiseConvolutionLayer3x3NHWCKernel>();
+        _kernel = arm_compute::support::cpp14::make_unique<CLDepthwiseConvolutionLayer3x3NCHWKernel>();
     }
 
+    // Configure kernel
     _kernel->set_target(CLScheduler::get().target());
-    _kernel->configure(input, weights, biases, output, conv_info, depth_multiplier, act_info);
+    _kernel->configure(input_to_use, weights_to_use, biases, output_to_use, conv_info, depth_multiplier, act_info);
 
+    // Permute output if needed
+    if(_needs_permute)
+    {
+        // Configure the function to transform the convolved output from the internal NCHW layout back to NHWC
+        _permuted_output.info()->set_data_layout(DataLayout::NCHW);
+        _permute_output_to_nhwc.configure(&_permuted_output, output, PermutationVector(2U, 0U, 1U));
+
+        // Allocate tensors
+        _permuted_input.allocator()->allocate();
+        _permuted_output.allocator()->allocate();
+    }
     // Configure border handler
     PixelValue &&zero_value(0.f);
     if(is_data_type_quantized_asymmetric(input->info()->data_type()))
     {
         zero_value = PixelValue(static_cast<uint8_t>(input->info()->quantization_info().offset));
     }
-    _border_handler.configure(input, _kernel->border_size(), BorderMode::CONSTANT, zero_value);
+    _border_handler.configure(input_to_use, _kernel->border_size(), BorderMode::CONSTANT, zero_value);
 }
 
 Status CLDepthwiseConvolutionLayer3x3::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
@@ -75,23 +132,99 @@
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
     ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
 
-    if(input->data_layout() == DataLayout::NCHW)
+    const bool                      is_nhwc               = input->data_layout() == DataLayout::NHWC;
+    const bool                      needs_permute         = is_nhwc && (depth_multiplier > 1);
+    const bool                      needs_weights_reshape = is_nhwc && (depth_multiplier == 1) && is_data_type_quantized_asymmetric(input->data_type());
+    const bool                      is_stride_1           = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1));
+    const bool                      is_dot8_supported     = dot8_supported(CLKernelLibrary::get().get_device());
+    DepthwiseConvolutionReshapeInfo info;
+    info.c0        = 4;
+    info.transpose = is_stride_1 && is_dot8_supported;
+
+    if(needs_permute)
     {
-        return CLDepthwiseConvolutionLayer3x3NCHWKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, gpu_target);
+        TensorShape permuted_input_shape   = input->tensor_shape();
+        TensorShape permuted_weights_shape = weights->tensor_shape();
+        TensorShape permuted_output_shape  = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier);
+
+        permute(permuted_input_shape, PermutationVector(1U, 2U, 0U));
+        permute(permuted_weights_shape, PermutationVector(1U, 2U, 0U));
+        permute(permuted_output_shape, PermutationVector(1U, 2U, 0U));
+
+        const TensorInfo permuted_input   = input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_input_shape).set_data_layout(DataLayout::NCHW);
+        const TensorInfo permuted_weights = weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_weights_shape).set_data_layout(DataLayout::NCHW);
+        const TensorInfo permuted_output  = output->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_output_shape).set_data_layout(DataLayout::NCHW);
+
+        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NCHWKernel::validate(&permuted_input, &permuted_weights, biases, &permuted_output, conv_info, depth_multiplier, act_info, gpu_target));
+    }
+    else if(is_nhwc)
+    {
+        if(needs_weights_reshape)
+        {
+            auto reshaped_weights_shape = arm_compute::misc::shape_calculator::compute_reshaped_depthwise_weights_shape(*weights, info);
+            ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NHWCKernel::validate(input, &weights->clone()->set_tensor_shape(reshaped_weights_shape), biases, output, conv_info, depth_multiplier,
+                                                                                           act_info));
+        }
+        else
+        {
+            ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NHWCKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info));
+        }
+    }
+    else
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NCHWKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, gpu_target));
     }
 
-    return CLDepthwiseConvolutionLayer3x3NHWCKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info);
+    return Status{};
 }
 
 void CLDepthwiseConvolutionLayer3x3::run()
 {
+    prepare();
+
+    _memory_group.acquire();
+
+    if(_needs_permute)
+    {
+        _permute_input_to_nchw.run();
+    }
     CLScheduler::get().enqueue(_border_handler);
     CLScheduler::get().enqueue(*_kernel);
+
+    if(_needs_permute)
+    {
+        _permute_output_to_nhwc.run();
+    }
+
+    _memory_group.release();
+}
+
+void CLDepthwiseConvolutionLayer3x3::prepare()
+{
+    if(!_is_prepared)
+    {
+        if(_needs_permute)
+        {
+            ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+            _permuted_weights.allocator()->allocate();
+            _permute_weights_to_nchw.run();
+            _original_weights->mark_as_unused();
+        }
+
+        if(_needs_weights_reshape)
+        {
+            ARM_COMPUTE_ERROR_ON(_needs_permute);
+            ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+            _permuted_weights.allocator()->allocate();
+            CLScheduler::get().enqueue(_reshape_weights);
+            _original_weights->mark_as_unused();
+        }
+        _is_prepared = true;
+    }
 }
 
 CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayer()
     : _im2col_kernel(), _weights_reshape_kernel(), _v2mm_kernel(), _vector_to_tensor_kernel(), _output_stage_kernel(), _activationlayer_function(), _v2mm_input_fill_border(), _v2mm_weights_fill_border(),
-      _input_reshaped(), _weights_reshaped(), _v2mm_output(), _output_reshaped(), _is_prepared(false), _is_quantized(false), _is_activationlayer_enabled(false), _original_weights(nullptr)
+      _input_reshaped(), _weights_reshaped(), _v2mm_output(), _output_reshaped(), _is_prepared(false), _is_quantized(false), _is_activationlayer_enabled(false), _original_weights(nullptr),
+      _optimised_function(nullptr)
 {
 }
 
@@ -104,98 +237,110 @@
 
     const size_t idx_w = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
     const size_t idx_h = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
-    const size_t idx_c = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL);
 
-    const size_t weights_w = weights->info()->dimension(idx_w);
-    const size_t weights_h = weights->info()->dimension(idx_h);
-    const size_t weights_z = weights->info()->dimension(idx_c);
+    const bool can_run_optimised_3x3_kernel = (weights->info()->dimension(idx_w) == 3) && (weights->info()->dimension(idx_h) == 3);
 
-    _is_prepared      = false;
-    _original_weights = weights;
-    _is_quantized     = is_data_type_quantized_asymmetric(input->info()->data_type());
-
-    bool            append_bias = (biases != nullptr) && !_is_quantized;
-    const GPUTarget gpu_target  = CLScheduler::get().target();
-
-    // Calculate output shape
-    TensorShape output_shape = shape_calculator::compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info, depth_multiplier);
-
-    // Output auto inizialitation if not yet initialized
-    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
-
-    // Output width and height
-    const unsigned int conv_w = output_shape[idx_w];
-    const unsigned int conv_h = output_shape[idx_h];
-
-    // Set up intermediate tensors
-    const size_t patch_size = weights_w * weights_h + ((append_bias) ? 1 : 0);
-    const size_t conv_size  = conv_w * conv_h;
-
-    // Im2Col configuration
-    TensorShape shape_im2col = input->info()->tensor_shape();
-    shape_im2col.set(0, patch_size);
-    shape_im2col.set(1, conv_size);
-    shape_im2col.set(2, weights_z);
-    _input_reshaped.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
-    _im2col_kernel.set_target(gpu_target);
-    _im2col_kernel.configure(input, &_input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier);
-    CLScheduler::get().tune_kernel_static(_im2col_kernel);
-
-    // Weights reshape configuration
-    const TensorShape shape_weights_reshape(patch_size, weights_z);
-    _weights_reshaped.allocator()->init(weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_weights_reshape));
-    _weights_reshape_kernel.configure(weights, &_weights_reshaped, append_bias ? biases : nullptr);
-
-    // GEMV configuration
-    DataType    v2mm_dt        = (input->info()->data_type() == DataType::QASYMM8) ? DataType::S32 : input->info()->data_type();
-    TensorShape shape_v2mm_out = input->info()->tensor_shape();
-    shape_v2mm_out.set(0, conv_size * weights_z);
-    shape_v2mm_out.set(1, 1);
-    shape_v2mm_out.set(2, 1);
-    _v2mm_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(v2mm_dt).set_tensor_shape(shape_v2mm_out));
-    _v2mm_kernel.set_target(gpu_target);
-    _v2mm_kernel.configure(&_input_reshaped, &_weights_reshaped, &_v2mm_output);
-    CLScheduler::get().tune_kernel_static(_v2mm_kernel);
-    _output_reshaped.allocator()->init(_v2mm_output.info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape));
-    _vector_to_tensor_kernel.configure(&_v2mm_output, (_is_quantized) ? &_output_reshaped : output, conv_w, conv_h);
-
-    // Output staged configuration
-    if(_is_quantized)
+    if(can_run_optimised_3x3_kernel)
     {
-        const QuantizationInfo output_quant_info = (output->info()->total_size() == 0) ? input->info()->quantization_info() : output->info()->quantization_info();
-
-        float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output_quant_info.scale;
-        int   output_multiplier, output_shift;
-        quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
-        _output_stage_kernel.configure(&_output_reshaped, biases, output, output_multiplier, output_shift, output_quant_info.offset);
-        _output_reshaped.allocator()->allocate();
+        auto f = arm_compute::support::cpp14::make_unique<CLDepthwiseConvolutionLayer3x3>();
+        f->configure(input, weights, biases, output, conv_info, depth_multiplier, act_info);
+        _optimised_function = std::move(f);
     }
-
-    // Fill borders on inputs
-    PixelValue zero_in(static_cast<int32_t>(0));
-    PixelValue zero_w(static_cast<int32_t>(0));
-    if(_is_quantized)
+    else
     {
-        zero_in = PixelValue(static_cast<int32_t>(input->info()->quantization_info().offset));
-        zero_w  = PixelValue(static_cast<int32_t>(weights->info()->quantization_info().offset));
-    }
-    BorderSize border_size = _v2mm_kernel.border_size();
-    _v2mm_input_fill_border.configure(&_input_reshaped, border_size, BorderMode::CONSTANT, zero_in);
+        const size_t idx_c = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL);
 
-    border_size.bottom = 0;
-    _v2mm_weights_fill_border.configure(&_weights_reshaped, border_size, BorderMode::CONSTANT, zero_w);
+        const size_t weights_w = weights->info()->dimension(idx_w);
+        const size_t weights_h = weights->info()->dimension(idx_h);
+        const size_t weights_z = weights->info()->dimension(idx_c);
 
-    // Allocate intermediate tensors
-    _input_reshaped.allocator()->allocate();
-    _v2mm_output.allocator()->allocate();
+        _is_prepared      = false;
+        _original_weights = weights;
+        _is_quantized     = is_data_type_quantized_asymmetric(input->info()->data_type());
 
-    //Configure Activation Layer
-    _is_activationlayer_enabled = act_info.enabled();
+        bool            append_bias = (biases != nullptr) && !_is_quantized;
+        const GPUTarget gpu_target  = CLScheduler::get().target();
 
-    if(_is_activationlayer_enabled)
-    {
-        _activationlayer_function.configure(output, nullptr, act_info);
+        // Calculate output shape
+        TensorShape output_shape = shape_calculator::compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info, depth_multiplier);
+
+        // Output auto initialization if not yet initialized
+        auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
+        ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
+
+        // Output width and height
+        const unsigned int conv_w = output_shape[idx_w];
+        const unsigned int conv_h = output_shape[idx_h];
+
+        // Set up intermediate tensors
+        const size_t patch_size = weights_w * weights_h + ((append_bias) ? 1 : 0);
+        const size_t conv_size  = conv_w * conv_h;
+
+        // Im2Col configuration
+        TensorShape shape_im2col = input->info()->tensor_shape();
+        shape_im2col.set(0, patch_size);
+        shape_im2col.set(1, conv_size);
+        shape_im2col.set(2, weights_z);
+        _input_reshaped.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
+        _im2col_kernel.set_target(gpu_target);
+        _im2col_kernel.configure(input, &_input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier);
+        CLScheduler::get().tune_kernel_static(_im2col_kernel);
+
+        // Weights reshape configuration
+        const TensorShape shape_weights_reshape(patch_size, weights_z);
+        _weights_reshaped.allocator()->init(weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_weights_reshape));
+        _weights_reshape_kernel.configure(weights, &_weights_reshaped, append_bias ? biases : nullptr);
+
+        // GEMV configuration
+        DataType    v2mm_dt        = (input->info()->data_type() == DataType::QASYMM8) ? DataType::S32 : input->info()->data_type();
+        TensorShape shape_v2mm_out = input->info()->tensor_shape();
+        shape_v2mm_out.set(0, conv_size * weights_z);
+        shape_v2mm_out.set(1, 1);
+        shape_v2mm_out.set(2, 1);
+        _v2mm_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(v2mm_dt).set_tensor_shape(shape_v2mm_out));
+        _v2mm_kernel.set_target(gpu_target);
+        _v2mm_kernel.configure(&_input_reshaped, &_weights_reshaped, &_v2mm_output);
+        CLScheduler::get().tune_kernel_static(_v2mm_kernel);
+        _output_reshaped.allocator()->init(_v2mm_output.info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape));
+        _vector_to_tensor_kernel.configure(&_v2mm_output, (_is_quantized) ? &_output_reshaped : output, conv_w, conv_h);
+
+        // Output staged configuration
+        if(_is_quantized)
+        {
+            const QuantizationInfo output_quant_info = (output->info()->total_size() == 0) ? input->info()->quantization_info() : output->info()->quantization_info();
+
+            float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output_quant_info.scale;
+            int   output_multiplier, output_shift;
+            quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
+            _output_stage_kernel.configure(&_output_reshaped, biases, output, output_multiplier, output_shift, output_quant_info.offset);
+            _output_reshaped.allocator()->allocate();
+        }
+
+        // Fill borders on inputs
+        PixelValue zero_in(static_cast<int32_t>(0));
+        PixelValue zero_w(static_cast<int32_t>(0));
+        if(_is_quantized)
+        {
+            zero_in = PixelValue(static_cast<int32_t>(input->info()->quantization_info().offset));
+            zero_w  = PixelValue(static_cast<int32_t>(weights->info()->quantization_info().offset));
+        }
+        BorderSize border_size = _v2mm_kernel.border_size();
+        _v2mm_input_fill_border.configure(&_input_reshaped, border_size, BorderMode::CONSTANT, zero_in);
+
+        border_size.bottom = 0;
+        _v2mm_weights_fill_border.configure(&_weights_reshaped, border_size, BorderMode::CONSTANT, zero_w);
+
+        // Allocate intermediate tensors
+        _input_reshaped.allocator()->allocate();
+        _v2mm_output.allocator()->allocate();
+
+        // Configure Activation Layer
+        _is_activationlayer_enabled = act_info.enabled();
+
+        if(_is_activationlayer_enabled)
+        {
+            _activationlayer_function.configure(output, nullptr, act_info);
+        }
     }
 }
 
@@ -204,55 +349,64 @@
 {
     const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
     const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
-    const size_t idx_c = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
 
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
-    ARM_COMPUTE_RETURN_ERROR_ON((input->dimension(idx_c) * depth_multiplier) != weights->dimension(idx_c));
+    const bool can_run_optimised_3x3_kernel = (weights->dimension(idx_w) == 3) && (weights->dimension(idx_h) == 3);
 
-    const bool         is_quantized = is_data_type_quantized_asymmetric(input->data_type());
-    const bool         append_bias  = (biases != nullptr) && !is_quantized;
-    const TensorShape  output_shape = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier);
-    const size_t       weights_w    = weights->dimension(idx_w);
-    const size_t       weights_h    = weights->dimension(idx_h);
-    const size_t       weights_z    = weights->dimension(idx_c);
-    const unsigned int conv_w       = output_shape[idx_w];
-    const unsigned int conv_h       = output_shape[idx_h];
-    const size_t       patch_size   = weights_w * weights_h + ((append_bias) ? 1 : 0);
-    const size_t       conv_size    = conv_w * conv_h;
-
-    TensorShape shape_im2col = input->tensor_shape();
-    shape_im2col.set(0, patch_size);
-    shape_im2col.set(1, conv_size);
-    shape_im2col.set(2, weights_z);
-    TensorInfo input_reshaped(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseIm2ColKernel::validate(input, &input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier));
-
-    const TensorShape shape_weights_reshape(patch_size, weights_z);
-    TensorInfo        weights_reshaped(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_weights_reshape));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseWeightsReshapeKernel::validate(weights, &weights_reshaped, append_bias ? biases : nullptr));
-
-    DataType    v2mm_dt        = (input->data_type() == DataType::QASYMM8) ? DataType::S32 : input->data_type();
-    TensorShape shape_v2mm_out = input->tensor_shape();
-    shape_v2mm_out.set(0, conv_size * weights_z);
-    shape_v2mm_out.set(1, 1);
-    shape_v2mm_out.set(2, 1);
-    TensorInfo v2mm_output(input->clone()->set_is_resizable(true).reset_padding().set_data_type(v2mm_dt).set_tensor_shape(shape_v2mm_out));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixVectorMultiplyKernel::validate(&input_reshaped, &weights_reshaped, &v2mm_output));
-
-    TensorInfo output_reshaped(v2mm_output.clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseVectorToTensorKernel::validate(&v2mm_output, (is_quantized) ? &output_reshaped : output, conv_w, conv_h));
-
-    if(is_quantized)
+    if(!can_run_optimised_3x3_kernel)
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(CLDirectConvolutionLayerOutputStageKernel::validate(&output_reshaped, biases, output));
-    }
+        const size_t idx_c = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
 
-    // Validate Activation Layer
-    if(act_info.enabled())
+        ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+        ARM_COMPUTE_RETURN_ERROR_ON((input->dimension(idx_c) * depth_multiplier) != weights->dimension(idx_c));
+
+        const bool         is_quantized = is_data_type_quantized_asymmetric(input->data_type());
+        const bool         append_bias  = (biases != nullptr) && !is_quantized;
+        const TensorShape  output_shape = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier);
+        const size_t       weights_w    = weights->dimension(idx_w);
+        const size_t       weights_h    = weights->dimension(idx_h);
+        const size_t       weights_z    = weights->dimension(idx_c);
+        const unsigned int conv_w       = output_shape[idx_w];
+        const unsigned int conv_h       = output_shape[idx_h];
+        const size_t       patch_size   = weights_w * weights_h + ((append_bias) ? 1 : 0);
+        const size_t       conv_size    = conv_w * conv_h;
+
+        TensorShape shape_im2col = input->tensor_shape();
+        shape_im2col.set(0, patch_size);
+        shape_im2col.set(1, conv_size);
+        shape_im2col.set(2, weights_z);
+        TensorInfo input_reshaped(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseIm2ColKernel::validate(input, &input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier));
+
+        const TensorShape shape_weights_reshape(patch_size, weights_z);
+        TensorInfo        weights_reshaped(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_weights_reshape));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayerReshapeWeightsGenericKernel::validate(weights, &weights_reshaped, append_bias ? biases : nullptr));
+
+        DataType    v2mm_dt        = (input->data_type() == DataType::QASYMM8) ? DataType::S32 : input->data_type();
+        TensorShape shape_v2mm_out = input->tensor_shape();
+        shape_v2mm_out.set(0, conv_size * weights_z);
+        shape_v2mm_out.set(1, 1);
+        shape_v2mm_out.set(2, 1);
+        TensorInfo v2mm_output(input->clone()->set_is_resizable(true).reset_padding().set_data_type(v2mm_dt).set_tensor_shape(shape_v2mm_out));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixVectorMultiplyKernel::validate(&input_reshaped, &weights_reshaped, &v2mm_output));
+
+        TensorInfo output_reshaped(v2mm_output.clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseVectorToTensorKernel::validate(&v2mm_output, (is_quantized) ? &output_reshaped : output, conv_w, conv_h));
+
+        if(is_quantized)
+        {
+            ARM_COMPUTE_RETURN_ON_ERROR(CLDirectConvolutionLayerOutputStageKernel::validate(&output_reshaped, biases, output));
+        }
+
+        // Validate Activation Layer
+        if(act_info.enabled())
+        {
+            ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output, nullptr, act_info));
+        }
+    }
+    else
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output, nullptr, act_info));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info));
     }
-
     return Status{};
 }
 
@@ -260,33 +414,48 @@
 {
     prepare();
 
-    CLScheduler::get().enqueue(_im2col_kernel);
-    CLScheduler::get().enqueue(_v2mm_input_fill_border);
-    CLScheduler::get().enqueue(_v2mm_kernel);
-    CLScheduler::get().enqueue(_vector_to_tensor_kernel);
-    if(_is_quantized)
+    if(_optimised_function != nullptr)
     {
-        CLScheduler::get().enqueue(_output_stage_kernel);
+        _optimised_function->run();
     }
-    if(_is_activationlayer_enabled)
+    else
     {
-        _activationlayer_function.run();
+        CLScheduler::get().enqueue(_im2col_kernel);
+        CLScheduler::get().enqueue(_v2mm_input_fill_border);
+        CLScheduler::get().enqueue(_v2mm_kernel);
+        CLScheduler::get().enqueue(_vector_to_tensor_kernel);
+        if(_is_quantized)
+        {
+            CLScheduler::get().enqueue(_output_stage_kernel);
+        }
+        if(_is_activationlayer_enabled)
+        {
+            _activationlayer_function.run();
+        }
     }
 }
 
 void CLDepthwiseConvolutionLayer::prepare()
 {
-    if(!_is_prepared)
+    if(_optimised_function != nullptr)
     {
-        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+        _optimised_function->prepare();
+    }
+    else
+    {
+        if(!_is_prepared)
+        {
+            ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
 
-        // Run weights reshaping and mark original weights tensor as unused
-        _weights_reshaped.allocator()->allocate();
-        CLScheduler::get().enqueue(_weights_reshape_kernel);
-        CLScheduler::get().enqueue(_v2mm_weights_fill_border);
-        _original_weights->mark_as_unused();
+            // Run weights reshaping and mark original weights tensor as unused
+            _weights_reshaped.allocator()->allocate();
+            CLScheduler::get().enqueue(_weights_reshape_kernel);
+            CLScheduler::get().enqueue(_v2mm_weights_fill_border);
+            _original_weights->mark_as_unused();
 
-        CLScheduler::get().queue().finish();
-        _is_prepared = true;
+            CLScheduler::get().queue().finish();
+            _is_prepared = true;
+        }
     }
 }
+} // namespace arm_compute
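
With this change, CLDepthwiseConvolutionLayer transparently dispatches 3x3 filters to the optimised CLDepthwiseConvolutionLayer3x3 path, including the NHWC permute/reshape handling added above. A sketch of a call that would take that route, assuming an NCHW layout with weights shaped (width, height, channels):

    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h"

    using namespace arm_compute;

    void depthwise_3x3()
    {
        CLTensor input, weights, biases, output;
        input.allocator()->init(TensorInfo(TensorShape(56U, 56U, 32U), 1, DataType::F32));
        weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 32U), 1, DataType::F32)); // 3x3 -> optimised path
        biases.allocator()->init(TensorInfo(TensorShape(32U), 1, DataType::F32));
        output.allocator()->init(TensorInfo(TensorShape(56U, 56U, 32U), 1, DataType::F32));

        CLDepthwiseConvolutionLayer depthwise;
        depthwise.configure(&input, &weights, &biases, &output, PadStrideInfo(1, 1, 1, 1));

        input.allocator()->allocate();
        weights.allocator()->allocate();
        biases.allocator()->allocate();
        output.allocator()->allocate();

        // prepare() (weight permute/reshape) runs lazily on the first run().
        depthwise.run();
    }
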
diff --git a/src/runtime/CL/functions/CLElementWiseUnaryLayer.cpp b/src/runtime/CL/functions/CLElementWiseUnaryLayer.cpp
new file mode 100644
index 0000000..b7e9a68
--- /dev/null
+++ b/src/runtime/CL/functions/CLElementWiseUnaryLayer.cpp
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLElementWiseUnaryLayer.h"
+
+#include "arm_compute/core/CL/kernels/CLElementWiseUnaryLayerKernel.h"
+#include "support/ToolchainSupport.h"
+
+#include <utility>
+
+namespace arm_compute
+{
+void CLRsqrtLayer::configure(const ICLTensor *input, ICLTensor *output)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLElementWiseUnaryLayerKernel>();
+    k->configure(input, output, ElementWiseUnary::RSQRT);
+    _kernel = std::move(k);
+}
+Status CLRsqrtLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+    return CLElementWiseUnaryLayerKernel::validate(input, output, ElementWiseUnary::RSQRT);
+}
+
+void CLExpLayer::configure(const ICLTensor *input, ICLTensor *output)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLElementWiseUnaryLayerKernel>();
+    k->configure(input, output, ElementWiseUnary::EXP);
+    _kernel = std::move(k);
+}
+Status CLExpLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+    return CLElementWiseUnaryLayerKernel::validate(input, output, ElementWiseUnary::EXP);
+}
+} // namespace arm_compute
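
Both wrappers share CLElementWiseUnaryLayerKernel and differ only in the ElementWiseUnary operation they pass. A minimal sketch for the reciprocal square root, assuming F32 support:

    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLElementWiseUnaryLayer.h"

    using namespace arm_compute;

    void rsqrt_example()
    {
        CLTensor src, dst;
        src.allocator()->init(TensorInfo(TensorShape(256U), 1, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(256U), 1, DataType::F32));

        CLRsqrtLayer rsqrt; // computes 1 / sqrt(x) element-wise
        rsqrt.configure(&src, &dst);

        src.allocator()->allocate();
        dst.allocator()->allocate();
        rsqrt.run();
    }
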
diff --git a/src/runtime/CL/functions/CLElementwiseOperations.cpp b/src/runtime/CL/functions/CLElementwiseOperations.cpp
new file mode 100644
index 0000000..28f4b13
--- /dev/null
+++ b/src/runtime/CL/functions/CLElementwiseOperations.cpp
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h"
+#include "support/ToolchainSupport.h"
+#include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h"
+
+#include <utility>
+
+namespace arm_compute
+{
+namespace
+{
+void configure_border_handler(CLFillBorderKernel &border_handler, BorderSize border_size, ICLTensor *input1, ICLTensor *input2, const ICLTensor *output)
+{
+    if(output->info()->dimension(0) > 1)
+    {
+        ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
+
+        if(broadcasted_info->info()->dimension(0) == 1)
+        {
+            border_handler.configure(broadcasted_info, border_size, BorderMode::REPLICATE);
+        }
+    }
+}
+} // namespace
+
+void CLArithmeticAddition::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLSaturatedArithmeticOperationKernel>();
+    k->configure(ArithmeticOperation::ADD, input1, input2, output, policy);
+    _kernel = std::move(k);
+    configure_border_handler(_border_handler, _kernel->border_size(), input1, input2, output);
+}
+
+Status CLArithmeticAddition::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
+{
+    return CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, input1, input2, output, policy);
+}
+
+void CLArithmeticSubtraction::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLSaturatedArithmeticOperationKernel>();
+    k->configure(ArithmeticOperation::SUB, input1, input2, output, policy);
+    _kernel = std::move(k);
+    configure_border_handler(_border_handler, _kernel->border_size(), input1, input2, output);
+}
+
+Status CLArithmeticSubtraction::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
+{
+    return CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::SUB, input1, input2, output, policy);
+}
+
+void CLArithmeticDivision::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLArithmeticOperationKernel>();
+    k->configure(ArithmeticOperation::DIV, input1, input2, output);
+    _kernel = std::move(k);
+    configure_border_handler(_border_handler, _kernel->border_size(), input1, input2, output);
+}
+
+Status CLArithmeticDivision::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+    return CLArithmeticOperationKernel::validate(ArithmeticOperation::DIV, input1, input2, output);
+}
+
+void CLElementwiseMax::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLArithmeticOperationKernel>();
+    k->configure(ArithmeticOperation::MAX, input1, input2, output);
+    _kernel = std::move(k);
+    configure_border_handler(_border_handler, _kernel->border_size(), input1, input2, output);
+}
+
+Status CLElementwiseMax::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+    return CLArithmeticOperationKernel::validate(ArithmeticOperation::MAX, input1, input2, output);
+}
+
+void CLElementwiseMin::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLArithmeticOperationKernel>();
+    k->configure(ArithmeticOperation::MIN, input1, input2, output);
+    _kernel = std::move(k);
+    configure_border_handler(_border_handler, _kernel->border_size(), input1, input2, output);
+}
+
+Status CLElementwiseMin::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+    return CLArithmeticOperationKernel::validate(ArithmeticOperation::MIN, input1, input2, output);
+}
+
+void CLElementwiseSquaredDiff::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLArithmeticOperationKernel>();
+    k->configure(ArithmeticOperation::SQUARED_DIFF, input1, input2, output);
+    _kernel = std::move(k);
+    configure_border_handler(_border_handler, _kernel->border_size(), input1, input2, output);
+}
+
+Status CLElementwiseSquaredDiff::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+    return CLArithmeticOperationKernel::validate(ArithmeticOperation::SQUARED_DIFF, input1, input2, output);
+}
+} // namespace arm_compute
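
The wrappers above all follow the library's simple-function pattern: configure() instantiates the shared elementwise kernel with an ArithmeticOperation tag, and a replicating border is registered when one input broadcasts along the x-dimension. A minimal usage sketch of the new CLArithmeticAddition, assuming an initialized CL scheduler and illustrative 16x4 F32 shapes (not part of the patch):

    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h"

    void example_elementwise_add()
    {
        using namespace arm_compute;
        CLScheduler::get().default_init(); // create a default context and queue

        CLTensor a, b, dst;
        a.allocator()->init(TensorInfo(TensorShape(16U, 4U), 1, DataType::F32));
        b.allocator()->init(TensorInfo(TensorShape(16U, 4U), 1, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(16U, 4U), 1, DataType::F32));

        CLArithmeticAddition add;
        add.configure(&a, &b, &dst, ConvertPolicy::SATURATE); // saturating ADD

        a.allocator()->allocate();
        b.allocator()->allocate();
        dst.allocator()->allocate();

        // ... map and fill a and b ...
        add.run(); // enqueues the CLSaturatedArithmeticOperationKernel
    }
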
diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp
index baa0cf4..e91038f 100644
--- a/src/runtime/CL/functions/CLGEMM.cpp
+++ b/src/runtime/CL/functions/CLGEMM.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -33,32 +33,42 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/gemm_reshaped/CLGEMMReshapedConfiguration.h"
 #include "arm_compute/runtime/ITensorAllocator.h"
 
-using namespace arm_compute;
+namespace arm_compute
+{
 using namespace arm_compute::misc::shape_calculator;
+using namespace arm_compute::cl_gemm;
 
 namespace
 {
-inline bool is_interleaved_transposed(int m, int n, int k, DataType data_type, bool reshape_b_only_on_first_run, GPUTarget gpu_target)
+inline bool is_interleaved_transposed(unsigned int m, unsigned int n, unsigned int k, DataType data_type, bool reshape_b_only_on_first_run, GPUTarget gpu_target)
 {
     bool flag = true;
 
     if(gpu_target_is_in(gpu_target, GPUTarget::G52, GPUTarget::G52LIT, GPUTarget::G71, GPUTarget::G72, GPUTarget::G76))
     {
-        // COMPMID-852
-        if(k > 256 && m > 4 && is_data_type_float(data_type) && reshape_b_only_on_first_run)
+        if((m > 1) && (n < 16))
         {
-            constexpr float alpha = 3.2f;
-            constexpr float fact0 = 1.51f;
-            constexpr float fact1 = 1.66f;
-            constexpr float ops   = 12.0f;
-            const float     scale = k > 1024 ? 1.07f : 1.0f;
-            flag                  = alpha + ((n * fact0) / ops) < ((fact1 * n * scale) / ops);
+            flag = true;
         }
         else
         {
-            flag = false;
+            // COMPMID-852
+            if(k > 256 && m > 4 && is_data_type_float(data_type) && reshape_b_only_on_first_run)
+            {
+                constexpr float alpha = 3.2f;
+                constexpr float fact0 = 1.51f;
+                constexpr float fact1 = 1.66f;
+                constexpr float ops   = 12.0f;
+                const float     scale = k > 1024 ? 1.07f : 1.0f;
+                flag                  = alpha + ((n * fact0) / ops) < ((fact1 * n * scale) / ops);
+            }
+            else
+            {
+                flag = false;
+            }
         }
     }
     else
@@ -73,17 +83,19 @@
 
 CLGEMM::CLGEMM(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)),
-      _interleave_kernel(),
-      _transpose_kernel(),
       _mm_kernel(),
       _ma_kernel(),
+      _reshape_lhs_kernel(),
+      _reshape_rhs_kernel(),
+      _mm_reshaped_kernel(),
       _tmp_a(),
       _tmp_b(),
       _original_b(nullptr),
       _is_interleaved_transposed(false),
       _run_addition(false),
       _reshape_b_only_on_first_run(false),
-      _is_prepared(false)
+      _is_prepared(false),
+      _is_new_gemm_reshaped(false)
 {
 }
 
@@ -106,29 +118,52 @@
     const GPUTarget gpu_target = CLScheduler::get().target();
 
     // Set the target for the kernels
-    _interleave_kernel.set_target(gpu_target);
+    _reshape_lhs_kernel.set_target(gpu_target);
     _mm_kernel.set_target(gpu_target);
 
     // Arguments used by GEMMReshapeInfo
-    // If we pass the matrix A and matrix B reshaped to CLGEMMMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to CLGEMMReshapeInfo
+    // If we pass the matrix A and matrix B reshaped to CLGEMMMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to GEMMReshapeInfo
     // in order to know how the matrices have been reshaped
-    bool      reinterpret_input_as_3d   = gemm_info.reinterpret_input_as_3d();
-    const int m                         = reinterpret_input_as_3d ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1);
-    const int n                         = b->info()->dimension(0);
-    const int k                         = a->info()->dimension(0);
-    const int depth_output_gemm3d       = gemm_info.depth_output_gemm3d();
-    int       mult_transpose1xW_width   = 1;
-    int       mult_interleave4x4_height = 1;
+    DataType           data_type                 = a->info()->data_type();
+    bool               reinterpret_input_as_3d   = gemm_info.reinterpret_input_as_3d();
+    const unsigned int m                         = reinterpret_input_as_3d ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1);
+    const unsigned int n                         = b->info()->dimension(0);
+    const unsigned int k                         = a->info()->dimension(0);
+    const unsigned int batch_size                = reinterpret_input_as_3d ? a->info()->dimension(3) : a->info()->dimension(2);
+    const int          depth_output_gemm3d       = gemm_info.depth_output_gemm3d();
+    int                mult_transpose1xW_width   = 1;
+    int                mult_interleave4x4_height = 1;
 
     if(get_arch_from_target(gpu_target) == GPUTarget::BIFROST)
     {
         mult_transpose1xW_width   = 4;
         mult_interleave4x4_height = 2;
     }
+    GEMMRHSMatrixInfo rhs_info;
+    rhs_info.n0         = 16 / b->info()->element_size();
+    rhs_info.k0         = 1;
+    rhs_info.h0         = mult_transpose1xW_width;
+    rhs_info.interleave = false;
+    rhs_info.transpose  = false;
+
+    GEMMLHSMatrixInfo lhs_info;
+    lhs_info.m0         = 4;
+    lhs_info.k0         = 4;
+    lhs_info.v0         = mult_interleave4x4_height;
+    lhs_info.interleave = true;
+    lhs_info.transpose  = true;
 
     // Check if we need to reshape the matrix A and matrix B
     _is_interleaved_transposed = is_interleaved_transposed(m, n, k, a->info()->data_type(), _reshape_b_only_on_first_run, gpu_target);
 
+    // Check if we can run the new reshaped GEMM
+    const auto workload   = static_cast<float>(m * n) / 20.0f;
+    _is_new_gemm_reshaped = (workload > 1600.0f) && (get_arch_from_target(gpu_target) == GPUTarget::BIFROST) && _is_interleaved_transposed && (data_type == DataType::F32);
+
+    const bool add_matrix_c  = (beta != 0.f && c != nullptr);
+    const bool is_beta_one   = std::abs(1.0f - beta) < 0.00001f;
+    const bool use_fused_add = is_beta_one && (c != nullptr && c->info()->num_dimensions() == 1) && !_is_new_gemm_reshaped;
+
+    // If _is_interleaved_transposed is set, force reinterpret_input_as_3d to be false, as the output of the LHS reshape kernel will be 2D
     if(_is_interleaved_transposed)
     {
@@ -145,19 +180,37 @@
         }
-        // _tmp_a and _tmp_b will be auto configured in _interleave_kernel and in _transpose_kernel
+        // _tmp_a and _tmp_b will be auto-configured in _reshape_lhs_kernel and _reshape_rhs_kernel
 
-        // Configure interleave kernel
-        _interleave_kernel.configure(a, &_tmp_a, mult_interleave4x4_height, gemm_info.reinterpret_input_as_3d());
+        if(_is_new_gemm_reshaped)
+        {
+            GEMMLHSMatrixInfo lhs_info; // Shadows the outer lhs_info: the reshaped path uses its own LHS configuration
 
-        // Configure transpose kernel
-        _transpose_kernel.configure(b, &_tmp_b, mult_transpose1xW_width);
+            // Pick up the GEMM configuration
+            std::tie(lhs_info, rhs_info) = CLGEMMReshapedConfigurationFactory::create()->configure(m, n, k, batch_size, data_type);
+
+            _reshape_lhs_kernel.configure(a, &_tmp_a, lhs_info, gemm_info.reinterpret_input_as_3d());
+            _reshape_rhs_kernel.configure(b, &_tmp_b, rhs_info);
+
+            // Configure and tune matrix multiply kernel
+            _mm_reshaped_kernel.configure(matrix_a, matrix_b, output, alpha, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1,
+                                                                                                                 depth_output_gemm3d, reinterpret_input_as_3d));
+        }
+        else
+        {
+            // Configure interleave kernel
+            _reshape_lhs_kernel.configure(a, &_tmp_a, lhs_info, gemm_info.reinterpret_input_as_3d());
+            // Configure transpose kernel
+            _reshape_rhs_kernel.configure(b, &_tmp_b, rhs_info);
+        }
     }
 
-    // Configure and tune matrix multiply kernel
-    _mm_kernel.configure(matrix_a, matrix_b, output, alpha, _is_interleaved_transposed, GEMMReshapeInfo(m, n, k,
-                                                                                                        mult_transpose1xW_width, mult_interleave4x4_height,
-                                                                                                        depth_output_gemm3d, reinterpret_input_as_3d),
-                         gemm_info.fp_mixed_precision());
-    CLScheduler::get().tune_kernel_static(_mm_kernel);
+    if(!_is_new_gemm_reshaped)
+    {
+        // Configure and tune matrix multiply kernel
+        _mm_kernel.configure(matrix_a, matrix_b, (add_matrix_c && !use_fused_add) ? nullptr : c, output, alpha, beta, _is_interleaved_transposed,
+                             GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d, reinterpret_input_as_3d),
+                             gemm_info.fp_mixed_precision());
+        CLScheduler::get().tune_kernel_static(_mm_kernel);
+    }
 
     if(_is_interleaved_transposed)
     {
@@ -170,7 +223,7 @@
     }
 
     // Configure matrix addition kernel
-    if(beta != 0 && c != nullptr)
+    if(add_matrix_c && !use_fused_add)
     {
         _ma_kernel.configure(c, output, beta);
         _run_addition = true;
@@ -197,13 +250,15 @@
     // Arguments used by GEMMReshapeInfo
-    // If we pass the matrix A and matrix B reshaped to CLGEMMMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to CLGEMMReshapeInfo
+    // If we pass the matrix A and matrix B reshaped to CLGEMMMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to GEMMReshapeInfo
     // in order to know how the matrices have been reshaped
-    bool      reinterpret_input_as_3d   = gemm_info.reinterpret_input_as_3d();
-    const int m                         = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
-    const int n                         = b->dimension(0);
-    const int k                         = a->dimension(0);
-    int       mult_transpose1xW_width   = 1;
-    int       mult_interleave4x4_height = 1;
-    const int depth_output_gemm3d       = gemm_info.depth_output_gemm3d();
+    DataType           data_type                 = a->data_type();
+    bool               reinterpret_input_as_3d   = gemm_info.reinterpret_input_as_3d();
+    const unsigned int m                         = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+    const unsigned int n                         = b->dimension(0);
+    const unsigned int k                         = a->dimension(0);
+    const unsigned int batch_size                = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
+    int                mult_transpose1xW_width   = 1;
+    int                mult_interleave4x4_height = 1;
+    const int          depth_output_gemm3d       = gemm_info.depth_output_gemm3d();
 
     if(get_arch_from_target(gpu_target) == GPUTarget::BIFROST)
     {
@@ -211,9 +266,31 @@
         mult_interleave4x4_height = 2;
     }
 
+    GEMMRHSMatrixInfo rhs_info;
+    rhs_info.n0         = 16 / b->element_size();
+    rhs_info.k0         = 1;
+    rhs_info.h0         = mult_transpose1xW_width;
+    rhs_info.interleave = false;
+    rhs_info.transpose  = false;
+
+    GEMMLHSMatrixInfo lhs_info;
+    lhs_info.m0         = 4;
+    lhs_info.k0         = 4;
+    lhs_info.v0         = mult_interleave4x4_height;
+    lhs_info.interleave = true;
+    lhs_info.transpose  = true;
+
     // Check if we need to reshape the matrix A and matrix B
     const bool run_interleave_transpose = is_interleaved_transposed(m, n, k, a->data_type(), reshape_b_only_on_first_run, gpu_target);
 
+    // Check if we can run the new reshaped GEMM
+    const auto workload             = static_cast<float>(m * n) / 20.0f;
+    const bool is_new_gemm_reshaped = (workload > 1600.f) && (get_arch_from_target(gpu_target) == GPUTarget::BIFROST) && run_interleave_transpose && (data_type == DataType::F32);
+
+    const bool add_matrix_c  = (beta != 0.f && c != nullptr);
+    const bool is_beta_one   = std::abs(1.0f - beta) < 0.00001f;
+    const bool use_fused_add = is_beta_one && (c != nullptr && c->num_dimensions() == 1) && !is_new_gemm_reshaped;
+
+    // If run_interleave_transpose is set, force reinterpret_input_as_3d to be false, as the output of the LHS reshape kernel will be 2D
     if(run_interleave_transpose)
     {
@@ -227,19 +304,42 @@
         matrix_a_info = &tmp_a_info;
         matrix_b_info = &tmp_b_info;
 
-        // Validate interleave kernel
-        auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_interleaved_shape(*a, mult_interleave4x4_height, gemm_info.reinterpret_input_as_3d())));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMInterleave4x4Kernel::validate(a, &tmp_a_info, mult_interleave4x4_height, gemm_info.reinterpret_input_as_3d()));
+        if(is_new_gemm_reshaped)
+        {
+            GEMMLHSMatrixInfo lhs_info; // Shadows the outer lhs_info: the reshaped path uses its own LHS configuration
 
-        // Validate transpose kernel
-        auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(*b, mult_transpose1xW_width)));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMTranspose1xWKernel::validate(b, &tmp_b_info, mult_transpose1xW_width));
+            // Pick up the GEMM configuration
+            std::tie(lhs_info, rhs_info) = CLGEMMReshapedConfigurationFactory::create()->configure(m, n, k, batch_size, data_type);
+
+            auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, gemm_info.reinterpret_input_as_3d())));
+            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeLHSMatrixKernel::validate(a, &tmp_a_info, lhs_info, gemm_info.reinterpret_input_as_3d()));
+
+            auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
+            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(b, &tmp_b_info, rhs_info));
+
+            // Validate matrix multiply
+            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyReshapedKernel::validate(matrix_a_info, matrix_b_info, output, alpha, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1,
+                                                                                     depth_output_gemm3d, reinterpret_input_as_3d)));
+        }
+        else
+        {
+            // Validate interleave kernel
+            auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, gemm_info.reinterpret_input_as_3d())));
+            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeLHSMatrixKernel::validate(a, &tmp_a_info, lhs_info, gemm_info.reinterpret_input_as_3d()));
+            // Validate transpose kernel
+            auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
+            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(b, &tmp_b_info, rhs_info));
+        }
     }
 
-    // Validate matrix multiply
-    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output, alpha, run_interleave_transpose, reshape_info, gpu_target, gemm_info.fp_mixed_precision()));
+    if(!is_new_gemm_reshaped)
+    {
+        // Validate matrix multiply
+        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, (add_matrix_c && !use_fused_add) ? nullptr : c, output, alpha, beta,
+                                                                         run_interleave_transpose, reshape_info, gpu_target, gemm_info.fp_mixed_precision()));
+    }
 
-    if(beta != 0 && c != nullptr)
+    if(add_matrix_c && !use_fused_add)
     {
         // Validate matrix addition kernel
         ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixAdditionKernel::validate(c, output, beta));
@@ -257,17 +357,24 @@
     if(_is_interleaved_transposed)
     {
         // Run interleave kernel
-        CLScheduler::get().enqueue(_interleave_kernel, false);
+        CLScheduler::get().enqueue(_reshape_lhs_kernel, false);
 
         if(!_reshape_b_only_on_first_run)
         {
             // Run transpose kernel
-            CLScheduler::get().enqueue(_transpose_kernel, false);
+            CLScheduler::get().enqueue(_reshape_rhs_kernel, false);
         }
     }
 
     // Run matrix multiply kernel
-    CLScheduler::get().enqueue(_mm_kernel, !_run_addition);
+    if(_is_new_gemm_reshaped)
+    {
+        CLScheduler::get().enqueue(_mm_reshaped_kernel, !_run_addition);
+    }
+    else
+    {
+        CLScheduler::get().enqueue(_mm_kernel, !_run_addition);
+    }
 
     // Run matrix addition kernel
     if(_run_addition)
@@ -286,10 +393,11 @@
         {
             // Run transpose kernel and mark original weights tensor as unused
             _tmp_b.allocator()->allocate();
-            CLScheduler::get().enqueue(_transpose_kernel, false);
+            CLScheduler::get().enqueue(_reshape_rhs_kernel, false);
             _original_b->mark_as_unused();
         }
         CLScheduler::get().queue().finish();
         _is_prepared = true;
     }
 }
+} // namespace arm_compute
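
The path selection added to CLGEMM above reduces to a single predicate: an m*n workload estimate above 1600, a Bifrost target, F32 data, and a positive interleave/transpose decision. A standalone restatement with two worked examples; the helper name and the main() harness are illustrative, not library API:

    #include <cstdio>

    // Mirrors the check in CLGEMM::configure()/validate():
    // workload = (m * n) / 20 must exceed 1600 on Bifrost, F32, reshaped
    static bool use_new_reshaped_gemm(unsigned int m, unsigned int n,
                                      bool is_bifrost, bool reshaped, bool is_f32)
    {
        const float workload = static_cast<float>(m * n) / 20.0f;
        return (workload > 1600.0f) && is_bifrost && reshaped && is_f32;
    }

    int main()
    {
        // 256x256: workload = 65536 / 20 = 3276.8 -> new reshaped kernels
        std::printf("%d\n", use_new_reshaped_gemm(256, 256, true, true, true));
        // 64x64: workload = 4096 / 20 = 204.8 -> legacy CLGEMMMatrixMultiplyKernel
        std::printf("%d\n", use_new_reshaped_gemm(64, 64, true, true, true));
        return 0;
    }
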
diff --git a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
index 4694aa7..7105e85 100644
--- a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -93,7 +93,7 @@
 CLGEMMConvolutionLayer::CLGEMMConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(memory_manager), _reshape_weights(), _im2col_kernel(), _mm_gemm(memory_manager), _mm_gemmlowp(memory_manager), _col2im_kernel(), _activationlayer_function(), _add_bias_kernel(),
       _original_weights(nullptr), _im2col_output(), _weights_reshaped(), _gemm_output(), _data_layout(DataLayout::NCHW), _append_bias(false), _skip_im2col(false), _skip_col2im(false), _is_quantized(false),
-      _is_activationlayer_enabled(false), _is_prepared(false)
+      _is_activationlayer_enabled(false), _is_prepared(false), _run_addition(true)
 {
 }
 
@@ -101,7 +101,8 @@
                                           int gemm_3d_depth)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_mm(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), gemmlowp_output_stage, gemm_3d_depth, _skip_im2col));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_mm(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), gemmlowp_output_stage, gemm_3d_depth, _skip_im2col,
+                                           _run_addition));
 
     const GEMMInfo &gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */,
                                          gemm_3d_depth, _skip_im2col /* Reinterpret the input as 3D if im2col is skipped */,
@@ -125,13 +126,15 @@
     }
     else
     {
+        // Do not fuse the bias into GEMM when im2col is used or when the separate add-bias kernel will run
+        const bool skip_bias_in_gemm = _run_addition || !_skip_im2col;
         // Configure matrix multiply function
-        _mm_gemm.configure(input, weights, nullptr, output, 1.0f, 0.0f, gemm_info);
+        _mm_gemm.configure(input, weights, (skip_bias_in_gemm) ? nullptr : biases, output, 1.0f, 1.0f, gemm_info);
     }
 }
 
 Status CLGEMMConvolutionLayer::validate_mm(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
-                                           const GEMMLowpOutputStageInfo &gemmlowp_output_stage, int gemm_3d_depth, bool skip_im2col)
+                                           const GEMMLowpOutputStageInfo &gemmlowp_output_stage, int gemm_3d_depth, bool skip_im2col, bool run_addition)
 {
     const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
 
@@ -156,8 +159,10 @@
     }
     else
     {
+        // Do not fuse the bias into GEMM when im2col is used or when the separate add-bias kernel will run
+        const bool skip_bias_in_gemm = run_addition || !skip_im2col;
         // Perform validation step on Matrix multiply function
-        return CLGEMM::validate(input, weights, nullptr, output, 1.0f, 0.0f, gemm_info);
+        return CLGEMM::validate(input, weights, (skip_bias_in_gemm) ? nullptr : biases, output, 1.0f, 1.0f, gemm_info);
     }
 }
 
@@ -193,6 +198,8 @@
     _skip_col2im                = data_layout == DataLayout::NHWC;
     _append_bias                = (biases != nullptr) && (!_is_quantized);
     _is_activationlayer_enabled = act_info.enabled();
+    // Run the separate add-bias kernel only when im2col is skipped and a bias is appended; for F16 the bias is fused into GEMM instead
+    _run_addition = (_skip_im2col) && (_append_bias) && (data_type != DataType::F16);
 
     // Set the GPU target for im2col and col2im
     _im2col_kernel.set_target(CLScheduler::get().target());
@@ -242,7 +249,7 @@
     else if(_append_bias)
     {
         // Configure add bias kernel
-        _add_bias_kernel.configure(output, biases, output, ConvertPolicy::SATURATE);
+        _add_bias_kernel.configure(ArithmeticOperation::ADD, output, biases, output, ConvertPolicy::SATURATE);
     }
 
     // Create GEMM output tensor
@@ -276,9 +283,9 @@
     {
         const QuantizationInfo output_quant_info = (output->info()->total_size() == 0) ? input->info()->quantization_info() : output->info()->quantization_info();
 
-        const float multiplier  = (input->info()->quantization_info().scale * weights->info()->quantization_info().scale) / output_quant_info.scale;
-        int   output_multiplier = 0;
-        int   output_shift      = 0;
+        const float multiplier        = (input->info()->quantization_info().scale * weights->info()->quantization_info().scale) / output_quant_info.scale;
+        int         output_multiplier = 0;
+        int         output_shift      = 0;
         quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
 
         int min_activation = 0;
@@ -375,6 +382,8 @@
     const bool skip_im2col                = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1);
     const bool skip_col2im                = data_layout == DataLayout::NHWC;
     bool       is_activationlayer_enabled = act_info.enabled();
+    // Run the separate add-bias kernel only when im2col is skipped and a bias is appended; for F16 the bias is fused into GEMM instead
+    const bool run_addition = (skip_im2col) && (append_bias) && (data_type != DataType::F16);
 
     ARM_COMPUTE_RETURN_ERROR_ON((weights->dimension(idx_channel) * num_groups) != input->dimension(idx_channel));
     ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
@@ -429,10 +438,10 @@
         ARM_COMPUTE_RETURN_ON_ERROR(CLIm2ColKernel::validate(input, &im2col_reshaped_info, kernel_dims, conv_info, append_bias, dilation, num_groups));
         gemm_input_to_use = &im2col_reshaped_info;
     }
-    else if(append_bias)
+    else if(run_addition)
     {
         // Validate add bias kernel
-        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAdditionKernel::validate(output, biases, output, ConvertPolicy::SATURATE));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, output, biases, output, ConvertPolicy::SATURATE));
     }
 
     // Create GEMM output tensor
@@ -459,9 +468,9 @@
     {
         const QuantizationInfo output_quant_info = (output->total_size() == 0) ? input->quantization_info() : output->quantization_info();
 
-        const float multiplier  = (input->quantization_info().scale * weights->quantization_info().scale) / output_quant_info.scale;
-        int   output_multiplier = 0;
-        int   output_shift      = 0;
+        const float multiplier        = (input->quantization_info().scale * weights->quantization_info().scale) / output_quant_info.scale;
+        int         output_multiplier = 0;
+        int         output_shift      = 0;
 
         ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift));
 
@@ -496,7 +505,7 @@
     // In case of NHWC, we need to run GEMM3D (gemm_3d_depth != 0) in order to avoid reshaping the output matrix
     const unsigned int gemm_3d_depth = (data_layout == DataLayout::NHWC) ? conv_h : 0;
 
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases, gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth, skip_im2col));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases, gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth, skip_im2col, run_addition));
 
     // Validate Col2Im
     if(!skip_col2im)
@@ -537,7 +546,7 @@
         _mm_gemm.run();
     }
 
-    if(_skip_im2col && _append_bias)
+    if(_run_addition)
     {
         CLScheduler::get().enqueue(_add_bias_kernel);
     }
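
The convolution-side changes hinge on one rule: the standalone add-bias kernel runs only when im2col is skipped, a bias is appended, and the data type is not F16; for F16 the bias instead rides through CLGEMM with beta = 1 and a one-dimensional c tensor. A simplified restatement of both predicates (flag names match the patch; the free functions themselves are illustrative):

    #include <cmath>

    // When does CLGEMMConvolutionLayer run the separate add-bias kernel?
    static bool run_separate_bias_add(bool skip_im2col, bool append_bias, bool is_f16)
    {
        return skip_im2col && append_bias && !is_f16; // F16 fuses the bias into GEMM
    }

    // When does CLGEMM fuse c into the matrix multiply kernel instead?
    static bool fuse_bias_in_gemm(float beta, bool c_is_vector, bool is_new_gemm_reshaped)
    {
        const bool is_beta_one = std::abs(1.0f - beta) < 0.00001f;
        return is_beta_one && c_is_vector && !is_new_gemm_reshaped;
    }
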
diff --git a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
index 2d4d231..2a01db7 100644
--- a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -31,42 +31,25 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/gemm_reshaped/CLGEMMReshapedConfiguration.h"
 
-using namespace arm_compute;
+namespace arm_compute
+{
 using namespace arm_compute::misc::shape_calculator;
+using namespace arm_compute::cl_gemm;
 
 namespace
 {
-inline bool is_interleaved_transposed(int m, int n, int k, bool reshape_b_only_on_first_run, GPUTarget gpu_target)
+inline bool is_gemm_reshaped(unsigned int m, bool reshape_b_only_on_first_run, GPUTarget gpu_target)
 {
-    bool flag = true;
-
-    if(gpu_target_is_in(gpu_target,
-                        GPUTarget::G71, GPUTarget::G72,
-                        GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT))
-    {
-        // COMPMID-852
-        if(k > 256 && m > 4 && reshape_b_only_on_first_run)
-        {
-            flag = ((0.72f + n * 0.10766f) < (n * 0.1284f));
-        }
-        else
-        {
-            flag = false;
-        }
-    }
-    else
-    {
-        flag = m > 1;
-    }
-
-    return flag;
+    return (get_arch_from_target(gpu_target) != GPUTarget::MIDGARD) && (m > 1) && (reshape_b_only_on_first_run);
 }
 } // namespace
 
 CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)),
       _mm_kernel(),
+      _mm_reshaped_kernel(),
       _mtx_a_reshape_kernel(),
       _mtx_b_reshape_kernel(),
       _mtx_a_reduction_kernel(),
@@ -81,7 +64,7 @@
       _original_b(nullptr),
       _a_offset(0),
       _b_offset(0),
-      _is_interleaved_transposed(true),
+      _is_gemm_reshaped(true),
       _reshape_b_only_on_first_run(false),
       _is_prepared(false),
       _fuse_output_stage(false)
@@ -108,23 +91,23 @@
 
     const ICLTensor *matrix_a = a;
     const ICLTensor *matrix_b = b;
+    GEMMRHSMatrixInfo rhs_info;
+    GEMMLHSMatrixInfo lhs_info;
 
     // Arguments used by GEMMReshapeInfo
-    // If we pass the matrix A and matrix B reshaped to CLGEMMMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to CLGEMMReshapeInfo
+    // If we pass the matrix A and matrix B reshaped to the matrix multiply kernel, we need to pass m, n, k and the GEMMReshapeInfo to know how the matrices have been reshaped
     // in order to know how the matrices have been reshaped
-    bool          reinterpret_input_as_3d   = gemm_info.reinterpret_input_as_3d();
-    const bool    unroll_block              = dot8_supported(CLKernelLibrary::get().get_device());
-    const int     m                         = reinterpret_input_as_3d ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1);
-    const int     n                         = b->info()->dimension(0);
-    const int     k                         = a->info()->dimension(0);
-    const int     depth_output_gemm3d       = gemm_info.depth_output_gemm3d();
-    constexpr int mult_transpose1xW_width   = 1;
-    constexpr int mult_interleave4x4_height = 1;
+    bool               reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+    const unsigned int m                       = reinterpret_input_as_3d ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1);
+    const unsigned int n                       = b->info()->dimension(0);
+    const unsigned int k                       = a->info()->dimension(0);
+    const unsigned int batch_size              = reinterpret_input_as_3d ? a->info()->dimension(3) : a->info()->dimension(2);
+    const int          depth_output_gemm3d     = gemm_info.depth_output_gemm3d();
 
     // Check if we need to reshape the matrix A and matrix B
-    _is_interleaved_transposed = is_interleaved_transposed(m, n, k, _reshape_b_only_on_first_run, gpu_target);
+    _is_gemm_reshaped = is_gemm_reshaped(m, _reshape_b_only_on_first_run, gpu_target);
 
-    if(_is_interleaved_transposed)
+    if(_is_gemm_reshaped)
     {
-        // if _is_interleaved_transposed is set, force reinterpret_input_as_3d to be false as the output of CLGEMMInterleaveKernel will be 2D
+        // If _is_gemm_reshaped is set, force reinterpret_input_as_3d to be false, as the output of the reshape kernel will be 2D
         reinterpret_input_as_3d = false;
@@ -138,11 +121,14 @@
             _memory_group.manage(&_tmp_b);
         }
 
+        // Pick up the GEMM configuration
+        std::tie(lhs_info, rhs_info) = CLGEMMReshapedConfigurationFactory::create()->configure(m, n, k, batch_size, DataType::QASYMM8);
+
         // Configure interleave kernel
-        _mtx_a_reshape_kernel.configure(a, &_tmp_a, mult_interleave4x4_height, gemm_info.reinterpret_input_as_3d(), unroll_block);
+        _mtx_a_reshape_kernel.configure(a, &_tmp_a, lhs_info, gemm_info.reinterpret_input_as_3d());
 
         // Configure transpose kernel
-        _mtx_b_reshape_kernel.configure(b, &_tmp_b, mult_transpose1xW_width);
+        _mtx_b_reshape_kernel.configure(b, &_tmp_b, rhs_info);
     }
 
     // Initialize matrix B reduction kernel only if _a_offset is not equal to 0
@@ -177,10 +163,16 @@
 
         _memory_group.manage(&_mm_result_s32);
 
-        // Configure matrix multiply kernel
-        _mm_kernel.configure(matrix_a, matrix_b, &_mm_result_s32, _is_interleaved_transposed, GEMMReshapeInfo(m, n, k,
-                                                                                                              mult_transpose1xW_width, mult_interleave4x4_height,
-                                                                                                              depth_output_gemm3d, reinterpret_input_as_3d));
+        if(_is_gemm_reshaped)
+        {
+            // Configure and tune matrix multiply kernel
+            _mm_reshaped_kernel.configure(matrix_a, matrix_b, &_mm_result_s32, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
+        }
+        else
+        {
+            // Configure matrix multiply kernel
+            _mm_kernel.configure(matrix_a, matrix_b, &_mm_result_s32, false, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
+        }
 
         // Configure offset contribution kernel
         _offset_contribution_output_stage_kernel.configure(&_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, c, output, a->info()->dimension(0),
@@ -190,17 +182,23 @@
     }
     else
     {
-        // Configure matrix multiply kernel
-        _mm_kernel.configure(matrix_a, matrix_b, output, _is_interleaved_transposed, GEMMReshapeInfo(m, n, k,
-                                                                                                     mult_transpose1xW_width, mult_interleave4x4_height,
-                                                                                                     depth_output_gemm3d, reinterpret_input_as_3d));
+        if(_is_gemm_reshaped)
+        {
+            // Configure and tune matrix multiply kernel
+            _mm_reshaped_kernel.configure(matrix_a, matrix_b, output, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
+        }
+        else
+        {
+            // Configure matrix multiply kernel
+            _mm_kernel.configure(matrix_a, matrix_b, output, false, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
+        }
 
         // Configure offset contribution kernel
         _offset_contribution_kernel.configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, c, a->info()->dimension(0), _a_offset, _b_offset);
     }
 
     // Allocate tensors
-    if(_is_interleaved_transposed)
+    if(_is_gemm_reshaped)
     {
         _tmp_a.allocator()->allocate();
         if(!_reshape_b_only_on_first_run)
@@ -233,18 +231,19 @@
     const ITensorInfo *matrix_a_info = a;
     const ITensorInfo *matrix_b_info = b;
 
-    TensorInfo tmp_a_info{};
-    TensorInfo tmp_b_info{};
+    TensorInfo        tmp_a_info{};
+    TensorInfo        tmp_b_info{};
+    GEMMRHSMatrixInfo rhs_info;
+    GEMMLHSMatrixInfo lhs_info;
 
-    bool          reinterpret_input_as_3d   = gemm_info.reinterpret_input_as_3d();
-    const int     m                         = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
-    const int     n                         = b->dimension(0);
-    const int     k                         = a->dimension(0);
-    constexpr int mult_transpose1xW_width   = 1;
-    constexpr int mult_interleave4x4_height = 1;
-    const int     depth_output_gemm3d       = gemm_info.depth_output_gemm3d();
+    bool               reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+    const unsigned int m                       = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+    const unsigned int n                       = b->dimension(0);
+    const unsigned int k                       = a->dimension(0);
+    const unsigned int batch_size              = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
+    const int          depth_output_gemm3d     = gemm_info.depth_output_gemm3d();
 
-    bool reshape_matrices = is_interleaved_transposed(m, n, k, gemm_info.reshape_b_only_on_first_run(), CLScheduler::get().target());
+    bool reshape_matrices = is_gemm_reshaped(m, gemm_info.reshape_b_only_on_first_run(), CLScheduler::get().target());
 
-    // if reshape_matrices is set, force reinterpret_input_as_3d to be false as the output of CLGEMMInterleaveKernel will be 2D
+    // If reshape_matrices is set, force reinterpret_input_as_3d to be false, as the output of the reshape kernel will be 2D
     if(reshape_matrices)
@@ -252,20 +251,24 @@
         reinterpret_input_as_3d = false;
     }
 
-    const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d, reinterpret_input_as_3d);
+    const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d);
 
     if(reshape_matrices)
     {
         matrix_a_info = &tmp_a_info;
         matrix_b_info = &tmp_b_info;
 
+        // Pick up the GEMM configuration
+        std::tie(lhs_info, rhs_info) = CLGEMMReshapedConfigurationFactory::create()->configure(m, n, k, batch_size, DataType::QASYMM8);
+
         // Validate interleave kernel
-        auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_interleaved_shape(*a, mult_interleave4x4_height, gemm_info.reinterpret_input_as_3d())));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMInterleave4x4Kernel::validate(a, &tmp_a_info, mult_interleave4x4_height, gemm_info.reinterpret_input_as_3d()));
+        auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, gemm_info.reinterpret_input_as_3d())));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeLHSMatrixKernel::validate(a, &tmp_a_info, lhs_info, gemm_info.reinterpret_input_as_3d()));
 
         // Validate transpose kernel
-        auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(*b, mult_transpose1xW_width)));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMTranspose1xWKernel::validate(b, &tmp_b_info, mult_transpose1xW_width));
+        auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(b, &tmp_b_info, rhs_info));
     }
 
     TensorInfo info_vector_sum_col, info_vector_sum_row;
@@ -292,12 +295,22 @@
     {
         TensorInfo mm_result_s32_info{};
 
-        // Output tensor auto inizialitation if not yet initialized
-        auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, reshape_matrices, reshape_info)).set_data_type(DataType::S32));
+        if(reshape_matrices)
+        {
+            // Output tensor auto initialization if not yet initialized
+            auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, reshape_info)).set_data_type(DataType::S32));
 
-        // Validate matrix multiply
-        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, reshape_matrices, reshape_info));
+            // Validate matrix multiply
+            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyReshapedKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, lhs_info, rhs_info, reshape_info));
+        }
+        else
+        {
+            // Output tensor auto initialization if not yet initialized
+            auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, false, reshape_info)).set_data_type(DataType::S32));
 
+            // Validate matrix multiply
+            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, false, reshape_info));
+        }
         // Validate offset contribution kernel
         ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOffsetContributionOutputStageKernel::validate(&mm_result_s32_info,
                                                                                             a_offset == 0 ? nullptr : &info_vector_sum_col,
@@ -309,9 +322,16 @@
     }
     else
     {
-        // Validate matrix multiply
-        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output, reshape_matrices, reshape_info));
-
+        if(reshape_matrices)
+        {
+            // Validate matrix multiply
+            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyReshapedKernel::validate(matrix_a_info, matrix_b_info, output, lhs_info, rhs_info, reshape_info));
+        }
+        else
+        {
+            // Validate matrix multiply
+            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output, false, reshape_info));
+        }
         // Validate offset contribution kernel
         ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOffsetContributionKernel::validate(output,
                                                                                  a_offset == 0 ? nullptr : &info_vector_sum_col,
@@ -329,7 +349,7 @@
 
     _memory_group.acquire();
 
-    if(_is_interleaved_transposed)
+    if(_is_gemm_reshaped)
     {
         // Run reshape matrix A
         CLScheduler::get().enqueue(_mtx_a_reshape_kernel, false);
@@ -348,7 +368,14 @@
     }
 
     // Run matrix multiply
-    CLScheduler::get().enqueue(_mm_kernel, false);
+    if(_is_gemm_reshaped)
+    {
+        CLScheduler::get().enqueue(_mm_reshaped_kernel, false);
+    }
+    else
+    {
+        CLScheduler::get().enqueue(_mm_kernel, false);
+    }
 
     // Run matrix A reduction kernel only if _b_offset is not equal to 0
     if(_b_offset != 0)
@@ -374,7 +401,7 @@
 {
     if(!_is_prepared)
     {
-        if(_is_interleaved_transposed && _reshape_b_only_on_first_run)
+        if(_is_gemm_reshaped && _reshape_b_only_on_first_run)
         {
             ARM_COMPUTE_ERROR_ON(!_original_b->is_used());
 
@@ -395,3 +422,4 @@
         _is_prepared = true;
     }
 }
+} // namespace arm_compute
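
After this change the quantized core reshapes whenever the target is not Midgard, m > 1, and the B reshape can be hoisted to the first run; the old per-target regression model is gone. A hypothetical QASYMM8 usage sketch (shapes, quantization parameters, and the S32 output are assumptions; the GEMMInfo flags follow the pattern CLGEMMConvolutionLayer uses above):

    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h"

    void example_lowp_gemm()
    {
        using namespace arm_compute;
        CLScheduler::get().default_init();

        CLTensor a, b, dst; // A: 32x8, B: 16x32 -> dst: 16x8, accumulated in S32
        a.allocator()->init(TensorInfo(TensorShape(32U, 8U), 1, DataType::QASYMM8, QuantizationInfo(0.5f, 10)));
        b.allocator()->init(TensorInfo(TensorShape(16U, 32U), 1, DataType::QASYMM8, QuantizationInfo(0.25f, 5)));
        dst.allocator()->init(TensorInfo(TensorShape(16U, 8U), 1, DataType::S32));

        CLGEMMLowpMatrixMultiplyCore mm;
        // m = 8 > 1 and reshape-B-on-first-run -> reshaped path on non-Midgard targets
        mm.configure(&a, &b, nullptr, &dst, GEMMInfo(false, false, true /* reshape B only on first run */));

        a.allocator()->allocate();
        b.allocator()->allocate();
        dst.allocator()->allocate();
        mm.run();
    }
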
diff --git a/src/runtime/CL/functions/CLGather.cpp b/src/runtime/CL/functions/CLGather.cpp
new file mode 100644
index 0000000..459438e
--- /dev/null
+++ b/src/runtime/CL/functions/CLGather.cpp
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLGather.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLGatherKernel.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+void CLGather::configure(const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLGatherKernel>();
+    k->configure(input, indices, output, axis);
+    _kernel = std::move(k);
+}
+
+Status CLGather::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis)
+{
+    return CLGatherKernel::validate(input, indices, output, axis);
+}
+} // namespace arm_compute
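
A hypothetical usage sketch of the new CLGather function (the shapes, the U32 index type, and axis 0 are assumptions): gathering three indices along axis 0 of an 8x4 tensor yields a 3x4 output.

    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLGather.h"

    void example_gather()
    {
        using namespace arm_compute;
        CLScheduler::get().default_init();

        CLTensor src, indices, dst;
        src.allocator()->init(TensorInfo(TensorShape(8U, 4U), 1, DataType::F32));
        indices.allocator()->init(TensorInfo(TensorShape(3U), 1, DataType::U32));
        dst.allocator()->init(TensorInfo(TensorShape(3U, 4U), 1, DataType::F32));

        CLGather gather;
        gather.configure(&src, &indices, &dst, 0); // pick 3 entries along axis 0

        src.allocator()->allocate();
        indices.allocator()->allocate();
        dst.allocator()->allocate();
        gather.run();
    }
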
diff --git a/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp b/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp
index 5dd1202..c50132e 100644
--- a/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp
+++ b/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -40,6 +40,7 @@
       _memset_kernel(),
       _padded_copy_kernel(),
       _cpp_nms_kernel(),
+      _is_nhwc(false),
       _deltas_permuted(),
       _deltas_flattened(),
       _scores_permuted(),
@@ -60,10 +61,11 @@
     ARM_COMPUTE_ERROR_ON_NULLPTR(scores, deltas, anchors, proposals, scores_out, num_valid_proposals);
     ARM_COMPUTE_ERROR_THROW_ON(CLGenerateProposalsLayer::validate(scores->info(), deltas->info(), anchors->info(), proposals->info(), scores_out->info(), num_valid_proposals->info(), info));
 
+    _is_nhwc                         = scores->info()->data_layout() == DataLayout::NHWC;
     const DataType data_type         = deltas->info()->data_type();
-    const int      num_anchors       = scores->info()->dimension(2);
-    const int      feat_width        = scores->info()->dimension(0);
-    const int      feat_height       = scores->info()->dimension(1);
+    const int      num_anchors       = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::CHANNEL));
+    const int      feat_width        = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::WIDTH));
+    const int      feat_height       = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::HEIGHT));
     const int      total_num_anchors = num_anchors * feat_width * feat_height;
     const int      pre_nms_topN      = info.pre_nms_topN();
     const int      post_nms_topN     = info.post_nms_topN();
@@ -77,21 +79,37 @@
     _deltas_flattened.allocator()->init(TensorInfo(flatten_shape_deltas, 1, data_type));
 
     // Permute and reshape deltas
-    _memory_group.manage(&_deltas_permuted);
-    _memory_group.manage(&_deltas_flattened);
-    _permute_deltas_kernel.configure(deltas, &_deltas_permuted, PermutationVector{ 2, 0, 1 });
-    _flatten_deltas_kernel.configure(&_deltas_permuted, &_deltas_flattened);
-    _deltas_permuted.allocator()->allocate();
+    if(!_is_nhwc)
+    {
+        _memory_group.manage(&_deltas_permuted);
+        _memory_group.manage(&_deltas_flattened);
+        _permute_deltas_kernel.configure(deltas, &_deltas_permuted, PermutationVector{ 2, 0, 1 });
+        _flatten_deltas_kernel.configure(&_deltas_permuted, &_deltas_flattened);
+        _deltas_permuted.allocator()->allocate();
+    }
+    else
+    {
+        _memory_group.manage(&_deltas_flattened);
+        _flatten_deltas_kernel.configure(deltas, &_deltas_flattened);
+    }
 
     const TensorShape flatten_shape_scores(1, total_num_anchors);
     _scores_flattened.allocator()->init(TensorInfo(flatten_shape_scores, 1, data_type));
 
     // Permute and reshape scores
-    _memory_group.manage(&_scores_permuted);
-    _memory_group.manage(&_scores_flattened);
-    _permute_scores_kernel.configure(scores, &_scores_permuted, PermutationVector{ 2, 0, 1 });
-    _flatten_scores_kernel.configure(&_scores_permuted, &_scores_flattened);
-    _scores_permuted.allocator()->allocate();
+    if(!_is_nhwc)
+    {
+        _memory_group.manage(&_scores_permuted);
+        _memory_group.manage(&_scores_flattened);
+        _permute_scores_kernel.configure(scores, &_scores_permuted, PermutationVector{ 2, 0, 1 });
+        _flatten_scores_kernel.configure(&_scores_permuted, &_scores_flattened);
+        _scores_permuted.allocator()->allocate();
+    }
+    else
+    {
+        _memory_group.manage(&_scores_flattened);
+        _flatten_scores_kernel.configure(scores, &_scores_flattened);
+    }
 
     // Bounding box transform
     _memory_group.manage(&_all_proposals);
@@ -141,11 +159,12 @@
                                           const ITensorInfo *num_valid_proposals, const GenerateProposalsInfo &info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(scores, deltas, anchors, proposals, scores_out, num_valid_proposals);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(scores, DataLayout::NCHW);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(scores, DataLayout::NCHW, DataLayout::NHWC);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(scores, deltas);
 
-    const int num_anchors       = scores->dimension(2);
-    const int feat_width        = scores->dimension(0);
-    const int feat_height       = scores->dimension(1);
+    const int num_anchors       = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::CHANNEL));
+    const int feat_width        = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::WIDTH));
+    const int feat_height       = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::HEIGHT));
     const int num_images        = scores->dimension(3);
     const int total_num_anchors = num_anchors * feat_width * feat_height;
     const int values_per_roi    = info.values_per_roi();
@@ -156,14 +175,21 @@
     ARM_COMPUTE_RETURN_ON_ERROR(CLComputeAllAnchorsKernel::validate(anchors, &all_anchors_info, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale())));
 
     TensorInfo deltas_permuted_info = deltas->clone()->set_tensor_shape(TensorShape(values_per_roi * num_anchors, feat_width, feat_height)).set_is_resizable(true);
-    ARM_COMPUTE_RETURN_ON_ERROR(CLPermuteKernel::validate(deltas, &deltas_permuted_info, PermutationVector{ 2, 0, 1 }));
+    TensorInfo scores_permuted_info = scores->clone()->set_tensor_shape(TensorShape(num_anchors, feat_width, feat_height)).set_is_resizable(true);
+    if(scores->data_layout() == DataLayout::NHWC)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(deltas, &deltas_permuted_info);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(scores, &scores_permuted_info);
+    }
+    else
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(CLPermuteKernel::validate(deltas, &deltas_permuted_info, PermutationVector{ 2, 0, 1 }));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLPermuteKernel::validate(scores, &scores_permuted_info, PermutationVector{ 2, 0, 1 }));
+    }
 
     TensorInfo deltas_flattened_info(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
     ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayerKernel::validate(&deltas_permuted_info, &deltas_flattened_info));
 
-    TensorInfo scores_permuted_info = scores->clone()->set_tensor_shape(TensorShape(num_anchors, feat_width, feat_height)).set_is_resizable(true);
-    ARM_COMPUTE_RETURN_ON_ERROR(CLPermuteKernel::validate(scores, &scores_permuted_info, PermutationVector{ 2, 0, 1 }));
-
     TensorInfo scores_flattened_info(deltas->clone()->set_tensor_shape(TensorShape(1, total_num_anchors)).set_is_resizable(true));
     TensorInfo proposals_4_roi_values(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
 
@@ -236,9 +262,12 @@
     CLScheduler::get().enqueue(_compute_anchors_kernel, false);
 
     // Transpose and reshape the inputs
-    CLScheduler::get().enqueue(_permute_deltas_kernel, false);
+    if(!_is_nhwc)
+    {
+        CLScheduler::get().enqueue(_permute_deltas_kernel, false);
+        CLScheduler::get().enqueue(_permute_scores_kernel, false);
+    }
     CLScheduler::get().enqueue(_flatten_deltas_kernel, false);
-    CLScheduler::get().enqueue(_permute_scores_kernel, false);
     CLScheduler::get().enqueue(_flatten_scores_kernel, false);
 
     // Build the boxes
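
The NHWC support above rests on resolving the feature-map dimensions through the data layout instead of fixed indices, and on skipping the permute kernels when channels are already innermost. A condensed, standalone restatement of the lookup (a sketch, not part of the patch):

    #include "arm_compute/core/Helpers.h"
    #include "arm_compute/core/ITensorInfo.h"
    #include "arm_compute/core/Types.h"

    // Number of anchors for either layout, given the scores tensor's info.
    // NCHW maps CHANNEL to index 2; NHWC maps CHANNEL to index 0.
    static int anchors_from_scores(const arm_compute::ITensorInfo &info)
    {
        using namespace arm_compute;
        const size_t ch = get_data_layout_dimension_index(info.data_layout(), DataLayoutDimension::CHANNEL);
        return static_cast<int>(info.dimension(ch));
    }
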
diff --git a/src/runtime/CL/functions/CLL2NormalizeLayer.cpp b/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
index 4f709d5..2e3c6d7 100644
--- a/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
+++ b/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
@@ -32,8 +32,8 @@
 #include "arm_compute/runtime/CL/CLScheduler.h"
 #include "support/ToolchainSupport.h"
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 CLL2NormalizeLayer::CLL2NormalizeLayer(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)), _reduce_func(), _normalize_kernel(), _sumsq()
 {
@@ -81,3 +81,4 @@
 
     _memory_group.release();
 }
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLLSTMLayer.cpp b/src/runtime/CL/functions/CLLSTMLayer.cpp
index a89c4e3..f01b1b8 100644
--- a/src/runtime/CL/functions/CLLSTMLayer.cpp
+++ b/src/runtime/CL/functions/CLLSTMLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -110,9 +110,9 @@
     _gemm_forget_gate.configure(output_state_in, &_forget_gate_out2, nullptr, &_forget_gate_out3, 1.f, 0.f);
     _forget_gate_out2.allocator()->allocate();
     _memory_group.manage(&_forget_gate_out5);
-    _accum_forget_gate1.configure(&_forget_gate_out1, &_forget_gate_out3, &_forget_gate_out5, ConvertPolicy::SATURATE);
+    _accum_forget_gate1.configure(ArithmeticOperation::ADD, &_forget_gate_out1, &_forget_gate_out3, &_forget_gate_out5, ConvertPolicy::SATURATE);
+    _forget_gate_out1.allocator()->allocate();
     CLTensor *forget_gate_out = &_forget_gate_out5;
-
     if(lstm_params.has_peephole_opt())
     {
         _forget_gate_out4.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
@@ -129,17 +129,18 @@
     {
         _forget_gate_out3.allocator()->allocate();
     }
-    _activation_forget_gate.configure(forget_gate_out, &_forget_gate_out1, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+    _activation_forget_gate.configure(forget_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
 
     // Configure block that calculates the input gate
     // input_gate = Activation(input * input_to_input_weights + output_state * recurrent_to_input_weights + PixelWiseMul(cell_state, cell_to_input_weights) + input_gate_bias), without CIFG
     // input_gate = 1 - forget_gate, with CIFG
     _input_gate_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+    CLTensor *input_gate_out = &_input_gate_out1;
     if(lstm_params.has_cifg_opt())
     {
         _memory_group.manage(&_input_gate_out1);
         _ones.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
-        _subtract_input_gate.configure(&_ones, &_forget_gate_out1, &_input_gate_out1, ConvertPolicy::SATURATE);
+        _subtract_input_gate.configure(ArithmeticOperation::SUB, &_ones, forget_gate_out, &_input_gate_out1, ConvertPolicy::SATURATE);
         _ones.allocator()->allocate();
         _run_cifg_opt = true;
     }
@@ -160,17 +161,23 @@
         _gemm_input_gate.configure(output_state_in, &_input_gate_out2, nullptr, &_input_gate_out3, 1.f, 0.f);
         _input_gate_out2.allocator()->allocate();
         _memory_group.manage(&_input_gate_out4);
-        _accum_input_gate1.configure(&_input_gate_out1, &_input_gate_out3, &_input_gate_out4, ConvertPolicy::SATURATE);
+        _accum_input_gate1.configure(ArithmeticOperation::ADD, &_input_gate_out1, &_input_gate_out3, &_input_gate_out4, ConvertPolicy::SATURATE);
+        _input_gate_out3.allocator()->allocate();
+        input_gate_out = &_input_gate_out4;
         if(_run_peephole_opt)
         {
             _memory_group.manage(&_input_gate_out5);
             _pixelwise_mul_input_gate.configure(cell_state_in, lstm_params.cell_to_input_weights(), &_input_gate_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
             _accum_input_gate2.configure(&_input_gate_out4, &_input_gate_out5, &_input_gate_out1, ConvertPolicy::SATURATE);
+            _input_gate_out4.allocator()->allocate();
             _input_gate_out5.allocator()->allocate();
+            input_gate_out = &_input_gate_out1;
         }
-        _input_gate_out3.allocator()->allocate();
-        _input_gate_out4.allocator()->allocate();
-        _activation_input_gate.configure(&_input_gate_out1, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+        else
+        {
+            _input_gate_out1.allocator()->allocate();
+        }
+        _activation_input_gate.configure(input_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
     }
 
     // Configure block that calculates the cell state
@@ -190,14 +197,13 @@
     _gemm_cell_state1.configure(output_state_in, &_cell_state_out2, nullptr, &_cell_state_out3, 1.f, 0.f);
     _cell_state_out2.allocator()->allocate();
     _memory_group.manage(&_cell_state_out4);
-    _accum_cell_state1.configure(&_cell_state_out1, &_cell_state_out3, &_cell_state_out4, ConvertPolicy::SATURATE);
+    _accum_cell_state1.configure(ArithmeticOperation::ADD, &_cell_state_out1, &_cell_state_out3, &_cell_state_out4, ConvertPolicy::SATURATE);
     _activation_cell_state.configure(&_cell_state_out4, nullptr, activation_info);
     _memory_group.manage(&_cell_state_out5);
-    _pixelwise_mul_cell_state1.configure(&_cell_state_out4, &_input_gate_out1, &_cell_state_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
+    _pixelwise_mul_cell_state1.configure(&_cell_state_out4, input_gate_out, &_cell_state_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
     _cell_state_out4.allocator()->allocate();
-    _pixelwise_mul_cell_state2.configure(&_forget_gate_out1, cell_state_in, &_cell_state_out3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
-    _forget_gate_out1.allocator()->allocate();
-    _accum_cell_state2.configure(&_cell_state_out5, &_cell_state_out3, &_cell_state_out1, ConvertPolicy::SATURATE);
+    _pixelwise_mul_cell_state2.configure(forget_gate_out, cell_state_in, &_cell_state_out3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
+    _accum_cell_state2.configure(ArithmeticOperation::ADD, &_cell_state_out5, &_cell_state_out3, &_cell_state_out1, ConvertPolicy::SATURATE);
     _cell_state_out3.allocator()->allocate();
     _cell_state_out5.allocator()->allocate();
     // Perform clipping
@@ -223,7 +229,7 @@
     _gemm_output.configure(output_state_in, &_output2, nullptr, &_output3, 1.f, 0.f);
     _output2.allocator()->allocate();
     _memory_group.manage(&_output5);
-    _accum_output1.configure(&_output1, &_output3, &_output5, ConvertPolicy::SATURATE);
+    _accum_output1.configure(ArithmeticOperation::ADD, &_output1, &_output3, &_output5, ConvertPolicy::SATURATE);
     _output3.allocator()->allocate();
     CLTensor *output_gate_out = &_output5;
     if(lstm_params.has_peephole_opt())
@@ -284,13 +290,13 @@
     std::vector<ICLTensor *> scratch_inputs;
     if(!lstm_params.has_cifg_opt())
     {
-        scratch_inputs.emplace_back(&_input_gate_out1);
+        scratch_inputs.emplace_back(input_gate_out);
     }
     scratch_inputs.emplace_back(&_cell_state_out1);
     scratch_inputs.emplace_back(forget_gate_out);
     scratch_inputs.emplace_back(output_gate_out);
     _concat_scratch_buffer.configure(scratch_inputs, scratch_buffer);
-    _input_gate_out1.allocator()->allocate();
+    input_gate_out->allocator()->allocate();
     _cell_state_out1.allocator()->allocate();
     forget_gate_out->allocator()->allocate();
     output_gate_out->allocator()->allocate();
@@ -364,7 +370,7 @@
     // Validate forget gate
     ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, input_to_forget_weights, forget_gate_bias, &forget_gate));
     ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &forget_gate, 1.f, 0.f, GEMMInfo()));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAdditionKernel::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, &forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
     if(lstm_params.has_peephole_opt())
     {
         ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &forget_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
@@ -396,7 +402,7 @@
     }
     else
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticSubtractionKernel::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::SUB, &forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
     }
 
     // Validate cell state
@@ -544,4 +550,4 @@
     _concat_scratch_buffer.run();
 
     _memory_group.release();
-}
\ No newline at end of file
+}
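Throughout the LSTM changes above, the accumulations and the CIFG subtraction move onto the unified elementwise kernels, whose configure() now takes the ArithmeticOperation to perform as its first argument; the function-level wrappers such as CLArithmeticAddition keep their old signatures but now live in CLElementwiseOperations.h. A minimal, hedged sketch of the wrapper-level API (shapes and names are illustrative):

    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h"

    using namespace arm_compute;

    int main()
    {
        CLScheduler::get().default_init();

        // Two 16x16 F32 inputs and one output
        CLTensor a, b, sum;
        const TensorInfo info(TensorShape(16U, 16U), 1, DataType::F32);
        a.allocator()->init(info);
        b.allocator()->init(info);
        sum.allocator()->init(info);

        CLArithmeticAddition add;
        add.configure(&a, &b, &sum, ConvertPolicy::SATURATE);

        a.allocator()->allocate();
        b.allocator()->allocate();
        sum.allocator()->allocate();

        add.run();
        CLScheduler::get().sync();
        return 0;
    }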
diff --git a/src/runtime/CL/functions/CLLaplacianPyramid.cpp b/src/runtime/CL/functions/CLLaplacianPyramid.cpp
index 7e5278f..559b57f 100644
--- a/src/runtime/CL/functions/CLLaplacianPyramid.cpp
+++ b/src/runtime/CL/functions/CLLaplacianPyramid.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -28,8 +28,8 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/CL/functions/CLArithmeticSubtraction.h"
 #include "arm_compute/runtime/CL/functions/CLDepthConvertLayer.h"
+#include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h"
 #include "arm_compute/runtime/CL/functions/CLGaussian5x5.h"
 #include "arm_compute/runtime/CL/functions/CLGaussianPyramid.h"
 #include "support/ToolchainSupport.h"
diff --git a/src/runtime/CL/functions/CLNormalizationLayer.cpp b/src/runtime/CL/functions/CLNormalizationLayer.cpp
index 32d8f15..8489fab 100644
--- a/src/runtime/CL/functions/CLNormalizationLayer.cpp
+++ b/src/runtime/CL/functions/CLNormalizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -45,7 +45,7 @@
     _norm_kernel.configure(input, output, norm_info);
 
     // Fill the border with 3 elements since we need vload4 in the IN_MAP normalization kernel
-    _border_handler.configure(input, _norm_kernel.border_size(), BorderMode::CONSTANT, PixelValue(0));
+    _border_handler.configure(input, _norm_kernel.border_size(), BorderMode::CONSTANT, PixelValue());
 }
 
 Status CLNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const NormalizationLayerInfo &norm_info)
diff --git a/src/runtime/CL/functions/CLPadLayer.cpp b/src/runtime/CL/functions/CLPadLayer.cpp
index de43c7d..3aa1b1e 100644
--- a/src/runtime/CL/functions/CLPadLayer.cpp
+++ b/src/runtime/CL/functions/CLPadLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -34,21 +34,21 @@
 {
 }
 
-void CLPadLayer::configure(ICLTensor *input, ICLTensor *output, const PaddingList &padding)
+void CLPadLayer::configure(ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value)
 {
     // Copy the input to the output
     _copy_kernel.configure(input, output, padding);
 
-    // Set the pages of the output to zero
-    _memset_kernel.configure(output, PixelValue());
+    // Set the elements of the output to the constant value
+    _memset_kernel.configure(output, constant_value);
 
-    // Fill padding on the first two dimensions with zeros
-    _fillborder_kernel.configure(input, input->info()->padding(), BorderMode::CONSTANT);
+    // Fill padding on the first two dimensions with the constant value
+    _fillborder_kernel.configure(input, input->info()->padding(), BorderMode::CONSTANT, constant_value);
 }
 
-Status CLPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding)
+Status CLPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, PixelValue constant_value)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(CLMemsetKernel::validate(input, PixelValue()));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLMemsetKernel::validate(input, constant_value));
     ARM_COMPUTE_RETURN_ON_ERROR(CLCopyKernel::validate(input, output, padding));
 
     return Status{};
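The extra constant_value parameter lets CLPadLayer fill with an arbitrary value rather than always zeroing. A hedged usage sketch, assuming src and dst are CLTensors initialized as in the addition sketch above:

    #include "arm_compute/runtime/CL/functions/CLPadLayer.h"

    // Pad one element before/after in x, two in y, filling with 0.5f
    CLPadLayer pad;
    const PaddingList padding = { { 1, 1 }, { 2, 2 } };
    pad.configure(&src, &dst, padding, PixelValue(0.5f));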
diff --git a/src/runtime/CL/functions/CLRNNLayer.cpp b/src/runtime/CL/functions/CLRNNLayer.cpp
index 1809e6e..63f00ac 100644
--- a/src/runtime/CL/functions/CLRNNLayer.cpp
+++ b/src/runtime/CL/functions/CLRNNLayer.cpp
@@ -60,7 +60,7 @@
 
     ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, weights, bias, &shape_info));
     ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(hidden_state, recurrent_weights, nullptr, &shape_info, 1.f, 0.f));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAdditionKernel::validate(&shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, &shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE));
     ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(&shape_info, &shape_info, info));
 
     return Status{};
@@ -90,7 +90,7 @@
     _add_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type()));
     _memory_group.manage(&_add_output);
 
-    _add_kernel.configure(&_fully_connected_out, &_gemm_output, &_add_output, ConvertPolicy::SATURATE);
+    _add_kernel.configure(ArithmeticOperation::ADD, &_fully_connected_out, &_gemm_output, &_add_output, ConvertPolicy::SATURATE);
 
     _fully_connected_out.allocator()->allocate();
     _gemm_output.allocator()->allocate();
@@ -127,4 +127,4 @@
 
         _is_prepared = true;
     }
-}
\ No newline at end of file
+}
diff --git a/src/runtime/CL/functions/CLROIPoolingLayer.cpp b/src/runtime/CL/functions/CLROIPoolingLayer.cpp
index 0f480ee..7bb4178 100644
--- a/src/runtime/CL/functions/CLROIPoolingLayer.cpp
+++ b/src/runtime/CL/functions/CLROIPoolingLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -30,7 +30,7 @@
 
 using namespace arm_compute;
 
-void CLROIPoolingLayer::configure(const ICLTensor *input, const ICLROIArray *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
+void CLROIPoolingLayer::configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
 {
     // Configure ROI pooling kernel
     auto k = arm_compute::support::cpp14::make_unique<CLROIPoolingLayerKernel>();
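The ROIs are now passed as a plain ICLTensor instead of the dedicated ICLROIArray container. A sketch under the assumption that each ROI is stored as a (batch_index, x1, y1, x2, y2) column in a U16 tensor of shape [5, N], and that feature_map and output are initialized CLTensors:

    #include "arm_compute/runtime/CL/functions/CLROIPoolingLayer.h"

    const unsigned int num_rois = 4;
    CLTensor rois;
    rois.allocator()->init(TensorInfo(TensorShape(5U, num_rois), 1, DataType::U16));

    // Pool every ROI to 7x7 at 1/16 spatial scale
    CLROIPoolingLayer roi_pool;
    roi_pool.configure(&feature_map, &rois, &output, ROIPoolingLayerInfo(7U, 7U, 0.0625f));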
diff --git a/src/runtime/CL/functions/CLRange.cpp b/src/runtime/CL/functions/CLRange.cpp
new file mode 100644
index 0000000..b2cd472
--- /dev/null
+++ b/src/runtime/CL/functions/CLRange.cpp
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLRange.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLRangeKernel.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+void CLRange::configure(ICLTensor *output, const float start, const float end, const float step)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLRangeKernel>();
+    k->set_target(CLScheduler::get().target());
+    k->configure(output, start, end, step);
+    _kernel = std::move(k);
+
+    // Statically tune the kernel
+    CLScheduler::get().tune_kernel_static(*_kernel);
+}
+
+Status CLRange::validate(const ITensorInfo *output, const float start, const float end, const float step)
+{
+    return CLRangeKernel::validate(output, start, end, step);
+}
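The new CLRange function fills a 1D tensor with the sequence start, start + step, ... up to but excluding end. A minimal sketch (the output length must match the number of generated values, here ten):

    #include "arm_compute/runtime/CL/functions/CLRange.h"

    CLTensor out;
    out.allocator()->init(TensorInfo(TensorShape(10U), 1, DataType::F32));

    CLRange range;
    range.configure(&out, 0.f, 10.f, 1.f); // 0, 1, ..., 9
    out.allocator()->allocate();
    range.run();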
diff --git a/src/runtime/CL/functions/CLReduceMean.cpp b/src/runtime/CL/functions/CLReduceMean.cpp
index 1016ff7..b2d0f81 100644
--- a/src/runtime/CL/functions/CLReduceMean.cpp
+++ b/src/runtime/CL/functions/CLReduceMean.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -45,22 +45,31 @@
     _reduced_outs      = arm_compute::support::cpp14::make_unique<CLTensor[]>(_reduction_ops - (keep_dims ? 1 : 0));
     _keep_dims         = keep_dims;
 
+    Coordinates axis_local = reduction_axis;
+    const int   input_dims = input->info()->num_dimensions();
+
+    // Convert negative axis
+    for(unsigned int i = 0; i < _reduction_ops; ++i)
+    {
+        axis_local[i] = wrap_around(axis_local[i], input_dims);
+    }
+
     // Perform reduction for every axis
     for(unsigned int i = 0; i < _reduction_ops; ++i)
     {
         TensorShape out_shape = i == 0 ? input->info()->tensor_shape() : (_reduced_outs.get() + i - 1)->info()->tensor_shape();
-        out_shape.set(reduction_axis[i], 1);
+        out_shape.set(axis_local[i], 1);
         auto in = (i == 0) ? input : (_reduced_outs.get() + i - 1);
 
         if(i == _reduction_ops - 1 && keep_dims)
         {
-            _reduction_kernels[i].configure(in, output, reduction_axis[i], ReductionOperation::MEAN_SUM);
+            _reduction_kernels[i].configure(in, output, axis_local[i], ReductionOperation::MEAN_SUM);
         }
         else
         {
             _reduced_outs[i].allocator()->init(TensorInfo(out_shape, input->info()->num_channels(), input->info()->data_type(), input->info()->quantization_info()));
             _memory_group.manage(_reduced_outs.get() + i);
-            _reduction_kernels[i].configure(in, _reduced_outs.get() + i, reduction_axis[i], ReductionOperation::MEAN_SUM);
+            _reduction_kernels[i].configure(in, _reduced_outs.get() + i, axis_local[i], ReductionOperation::MEAN_SUM);
         }
     }
 
@@ -77,11 +86,10 @@
 
         // We have to sort the reduction axis vectors in order for remove_dimension
         // to work properly
-        Coordinates axis_copy = reduction_axis;
-        std::sort(axis_copy.begin(), axis_copy.begin() + _reduction_ops);
+        std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops);
         for(unsigned int i = 0; i < _reduction_ops; ++i)
         {
-            out_shape.remove_dimension(axis_copy[i] - i);
+            out_shape.remove_dimension(axis_local[i] - i);
         }
         auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(out_shape));
         _reshape.configure(_reduced_outs.get() + _reduction_ops - 1, output);
@@ -90,22 +98,43 @@
 
 Status CLReduceMean::validate(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output)
 {
-    ARM_COMPUTE_UNUSED(keep_dims);
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
     ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions());
 
-    for(unsigned int i = 0; i < reduction_axis.num_dimensions(); ++i)
+    TensorShape out_shape = input->tensor_shape();
+
+    Coordinates        axis_sorted   = reduction_axis;
+    const unsigned int reduction_ops = reduction_axis.num_dimensions();
+    const int          input_dims    = input->num_dimensions();
+
+    // Convert negative axis
+    for(unsigned int i = 0; i < reduction_ops; ++i)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis[i] > 3);
-        ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(reduction_axis[i]) > input->num_dimensions() - 1);
+        axis_sorted[i] = wrap_around(axis_sorted[i], input_dims);
+    }
+
+    std::sort(axis_sorted.begin(), axis_sorted.begin() + reduction_ops);
+    for(unsigned int i = 0; i < reduction_ops; ++i)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(axis_sorted[i] > 3);
+        ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_sorted[i]) > input->num_dimensions() - 1);
         if(output->total_size() > 0 && keep_dims)
         {
-            ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(reduction_axis[i]) != 1);
+            ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_sorted[i]) != 1);
         }
-
-        ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperation::validate(input, output, reduction_axis[i], ReductionOperation::MEAN_SUM));
+        if(keep_dims)
+        {
+            out_shape.set(axis_sorted[i], 1);
+        }
+        else
+        {
+            out_shape.remove_dimension(axis_sorted[i] - i);
+        }
     }
 
+    const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info);
+
     return Status{};
 }
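Because the axes are now passed through wrap_around(), CLReduceMean accepts negative reduction axes counted from the last dimension. A hedged sketch, with the output left for auto-initialization:

    #include "arm_compute/runtime/CL/functions/CLReduceMean.h"

    // Reduce the last axis of an 8x4x2 tensor; -1 wraps to dimension 2
    CLTensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(8U, 4U, 2U), 1, DataType::F32));

    CLReduceMean reduce_mean;
    reduce_mean.configure(&src, Coordinates(-1), /*keep_dims=*/true, &dst);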
 
diff --git a/src/runtime/CL/functions/CLReductionOperation.cpp b/src/runtime/CL/functions/CLReductionOperation.cpp
index c5447ff..3d82e3f 100644
--- a/src/runtime/CL/functions/CLReductionOperation.cpp
+++ b/src/runtime/CL/functions/CLReductionOperation.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -56,15 +56,19 @@
 } // namespace
 
 CLReductionOperation::CLReductionOperation(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _sums_vector(), _reduction_kernels_vector(), _border_handlers_vector(), _num_of_stages(), _reduction_axis(), _is_quantized()
+    : _memory_group(std::move(memory_manager)), _results_vector(), _reduction_kernels_vector(), _border_handlers_vector(), _num_of_stages(), _reduction_axis(), _is_serial()
 {
 }
 
 Status CLReductionOperation::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
 {
     const unsigned int num_of_stages = calculate_number_of_stages(input, axis);
-
-    if(axis == 0 && !is_data_type_quantized(input->data_type()))
+    bool               is_serial     = is_data_type_quantized(input->data_type()) || axis != 0;
+    if(is_serial)
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(input, output, axis, op));
+    }
+    else
     {
         // Create temporary tensor infos
         auto sums_vector = arm_compute::support::cpp14::make_unique<TensorInfo[]>(num_of_stages - 1);
@@ -81,17 +85,25 @@
         }
 
         ReductionOperation first_kernel_op;
+        ReductionOperation intermediate_kernel_op;
         ReductionOperation last_kernel_op;
         switch(op)
         {
             case ReductionOperation::SUM:
             case ReductionOperation::MEAN_SUM:
-                first_kernel_op = ReductionOperation::SUM;
-                last_kernel_op  = op;
+                first_kernel_op        = ReductionOperation::SUM;
+                intermediate_kernel_op = ReductionOperation::SUM;
+                last_kernel_op         = op;
                 break;
             case ReductionOperation::SUM_SQUARE:
-                first_kernel_op = ReductionOperation::SUM_SQUARE;
-                last_kernel_op  = ReductionOperation::SUM;
+                first_kernel_op        = ReductionOperation::SUM_SQUARE;
+                intermediate_kernel_op = ReductionOperation::SUM;
+                last_kernel_op         = ReductionOperation::SUM;
+                break;
+            case ReductionOperation::PROD:
+                first_kernel_op        = ReductionOperation::PROD;
+                intermediate_kernel_op = ReductionOperation::PROD;
+                last_kernel_op         = ReductionOperation::PROD;
                 break;
             default:
                 ARM_COMPUTE_ERROR("Not supported");
@@ -103,17 +115,13 @@
         // Validate ReductionOperation on intermediate stages
         for(unsigned int i = 1; i < num_of_stages - 1; ++i)
         {
-            ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(sums_vector.get() + i - 1, sums_vector.get() + i, axis, ReductionOperation::SUM));
+            ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(sums_vector.get() + i - 1, sums_vector.get() + i, axis, intermediate_kernel_op));
         }
 
         // Validate ReductionOperation on the last stage
         const unsigned int last_stage = num_of_stages - 1;
         ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(sums_vector.get() + last_stage - 1, output, axis, last_kernel_op, input->dimension(0)));
     }
-    else
-    {
-        ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(input, output, axis, op));
-    }
 
     return Status{};
 }
@@ -122,65 +130,77 @@
 {
     _num_of_stages  = calculate_number_of_stages(input->info(), axis);
     _reduction_axis = axis;
-    _is_quantized   = is_data_type_quantized(input->info()->data_type());
+    _is_serial      = is_data_type_quantized(input->info()->data_type()) || axis != 0;
 
     // Configure reduction operation kernels
     _reduction_kernels_vector = arm_compute::support::cpp14::make_unique<CLReductionOperationKernel[]>(_num_of_stages);
 
     // Create temporary tensors
-    if(axis == 0 && !_is_quantized)
+    if(_is_serial)
+    {
+        _reduction_kernels_vector[0].configure(input, output, axis, op, 0);
+    }
+    else
     {
         _border_handlers_vector = arm_compute::support::cpp14::make_unique<CLFillBorderKernel[]>(_num_of_stages);
-        _sums_vector            = arm_compute::support::cpp14::make_unique<CLTensor[]>(_num_of_stages - 1);
+        _results_vector         = arm_compute::support::cpp14::make_unique<CLTensor[]>(_num_of_stages - 1);
         TensorShape shape{ input->info()->tensor_shape() };
         for(unsigned int i = 0; i < _num_of_stages - 1; i++)
         {
             shape.set(0, ceil(shape.x() / 128.f));
-            _sums_vector[i].allocator()->init(input->info()->clone()->set_tensor_shape(shape));
+            _results_vector[i].allocator()->init(input->info()->clone()->set_tensor_shape(shape));
         }
 
         // Apply ReductionOperation only on first kernel
-        _memory_group.manage(_sums_vector.get());
+        _memory_group.manage(_results_vector.get());
 
         ReductionOperation first_kernel_op;
+        ReductionOperation intermediate_kernel_op;
         ReductionOperation last_kernel_op;
+        PixelValue         pixelValue;
         switch(op)
         {
             case ReductionOperation::SUM:
             case ReductionOperation::MEAN_SUM:
-                first_kernel_op = ReductionOperation::SUM;
-                last_kernel_op  = op;
+                first_kernel_op        = ReductionOperation::SUM;
+                intermediate_kernel_op = ReductionOperation::SUM;
+                last_kernel_op         = op;
+                pixelValue             = PixelValue();
                 break;
             case ReductionOperation::SUM_SQUARE:
-                first_kernel_op = ReductionOperation::SUM_SQUARE;
-                last_kernel_op  = ReductionOperation::SUM;
+                first_kernel_op        = ReductionOperation::SUM_SQUARE;
+                intermediate_kernel_op = ReductionOperation::SUM;
+                last_kernel_op         = ReductionOperation::SUM;
+                pixelValue             = PixelValue();
+                break;
+            case ReductionOperation::PROD:
+                first_kernel_op        = ReductionOperation::PROD;
+                intermediate_kernel_op = ReductionOperation::PROD;
+                last_kernel_op         = ReductionOperation::PROD;
+                pixelValue             = PixelValue(1, input->info()->data_type());
                 break;
             default:
                 ARM_COMPUTE_ERROR("Not supported");
         }
 
-        _reduction_kernels_vector[0].configure(input, _sums_vector.get(), axis, first_kernel_op);
-        _border_handlers_vector[0].configure(input, _reduction_kernels_vector[0].border_size(), BorderMode::CONSTANT, PixelValue(0));
+        _reduction_kernels_vector[0].configure(input, _results_vector.get(), axis, first_kernel_op);
+        _border_handlers_vector[0].configure(input, _reduction_kernels_vector[0].border_size(), BorderMode::CONSTANT, pixelValue);
 
         // Apply ReductionOperation on intermediate stages
         for(unsigned int i = 1; i < _num_of_stages - 1; ++i)
         {
-            _memory_group.manage(_sums_vector.get() + i);
-            _reduction_kernels_vector[i].configure(_sums_vector.get() + i - 1, _sums_vector.get() + i, axis, ReductionOperation::SUM);
-            _border_handlers_vector[i].configure(_sums_vector.get() + i - 1, _reduction_kernels_vector[i].border_size(), BorderMode::CONSTANT, PixelValue(0));
-            _sums_vector[i - 1].allocator()->allocate();
+            _memory_group.manage(_results_vector.get() + i);
+            _reduction_kernels_vector[i].configure(_results_vector.get() + i - 1, _results_vector.get() + i, axis, intermediate_kernel_op);
+            _border_handlers_vector[i].configure(_results_vector.get() + i - 1, _reduction_kernels_vector[i].border_size(), BorderMode::CONSTANT, pixelValue);
+            _results_vector[i - 1].allocator()->allocate();
         }
 
         // Apply ReductionOperation on the last stage
         const unsigned int last_stage  = _num_of_stages - 1;
         const unsigned int input_width = input->info()->dimension(0);
-        _reduction_kernels_vector[last_stage].configure(_sums_vector.get() + last_stage - 1, output, axis, last_kernel_op, input_width);
-        _border_handlers_vector[last_stage].configure(_sums_vector.get() + last_stage - 1, _reduction_kernels_vector[last_stage].border_size(), BorderMode::CONSTANT, PixelValue(0));
-        _sums_vector[last_stage - 1].allocator()->allocate();
-    }
-    else
-    {
-        _reduction_kernels_vector[0].configure(input, output, axis, op, 0);
+        _reduction_kernels_vector[last_stage].configure(_results_vector.get() + last_stage - 1, output, axis, last_kernel_op, input_width);
+        _border_handlers_vector[last_stage].configure(_results_vector.get() + last_stage - 1, _reduction_kernels_vector[last_stage].border_size(), BorderMode::CONSTANT, pixelValue);
+        _results_vector[last_stage - 1].allocator()->allocate();
     }
 }
 
@@ -188,7 +208,11 @@
 {
     _memory_group.acquire();
 
-    if(_reduction_axis == 0 && !_is_quantized)
+    if(_is_serial)
+    {
+        CLScheduler::get().enqueue(_reduction_kernels_vector[0], false);
+    }
+    else
     {
         for(unsigned int i = 0; i < _num_of_stages; ++i)
         {
@@ -196,10 +220,6 @@
             CLScheduler::get().enqueue(_reduction_kernels_vector[i], false);
         }
     }
-    else
-    {
-        CLScheduler::get().enqueue(_reduction_kernels_vector[0], false);
-    }
 
     _memory_group.release();
 }
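PROD joins SUM, MEAN_SUM and SUM_SQUARE in the parallel reduction path; note that the border fill becomes PixelValue(1, ...) so padded lanes contribute the multiplicative identity instead of zero. A sketch of the call, with src and dst assumed initialized:

    #include "arm_compute/runtime/CL/functions/CLReductionOperation.h"

    CLReductionOperation reduction;
    reduction.configure(&src, &dst, /*axis=*/0U, ReductionOperation::PROD);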
diff --git a/src/runtime/CL/functions/CLReverse.cpp b/src/runtime/CL/functions/CLReverse.cpp
new file mode 100644
index 0000000..0f86b9f
--- /dev/null
+++ b/src/runtime/CL/functions/CLReverse.cpp
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLReverse.h"
+
+#include "arm_compute/core/CL/kernels/CLReverseKernel.h"
+#include "arm_compute/core/Types.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+void CLReverse::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *axis)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLReverseKernel>();
+    k->configure(input, output, axis);
+    _kernel = std::move(k);
+}
+
+Status CLReverse::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis)
+{
+    return CLReverseKernel::validate(input, output, axis);
+}
+} // namespace arm_compute
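CLReverse flips the input along the dimensions listed in a small axis tensor. A hedged sketch, assuming the axis tensor is a 1D U32 tensor holding dimension indices and src/dst are initialized:

    #include "arm_compute/runtime/CL/functions/CLReverse.h"

    // Reverse along dimension 0: the axis tensor holds the single index 0
    CLTensor axis;
    axis.allocator()->init(TensorInfo(TensorShape(1U), 1, DataType::U32));

    CLReverse reverse;
    reverse.configure(&src, &dst, &axis);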
diff --git a/src/runtime/CL/functions/CLSelect.cpp b/src/runtime/CL/functions/CLSelect.cpp
new file mode 100644
index 0000000..90c368e
--- /dev/null
+++ b/src/runtime/CL/functions/CLSelect.cpp
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLSelect.h"
+
+#include "arm_compute/core/CL/kernels/CLSelectKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+namespace arm_compute
+{
+void CLSelect::configure(const ICLTensor *c, const ICLTensor *x, const ICLTensor *y, ICLTensor *output)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLSelectKernel>();
+    k->configure(c, x, y, output);
+    _kernel = std::move(k);
+}
+
+Status CLSelect::validate(const ITensorInfo *c, const ITensorInfo *x, const ITensorInfo *y, const ITensorInfo *output)
+{
+    return CLSelectKernel::validate(c, x, y, output);
+}
+} // namespace arm_compute
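CLSelect computes an elementwise output[i] = c[i] ? x[i] : y[i]. A sketch, assuming cond is a U8 condition tensor shaped like x and y:

    #include "arm_compute/runtime/CL/functions/CLSelect.h"

    CLSelect select;
    select.configure(&cond, &x, &y, &out);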
diff --git a/src/runtime/CL/functions/CLSlice.cpp b/src/runtime/CL/functions/CLSlice.cpp
index bef7eca..f630853 100644
--- a/src/runtime/CL/functions/CLSlice.cpp
+++ b/src/runtime/CL/functions/CLSlice.cpp
@@ -36,10 +36,10 @@
     ARM_COMPUTE_ERROR_ON_NULLPTR(input);
 
-    // Get absolute end coordinates
-    const Coordinates ends_abs = arm_compute::helpers::tensor_transform::slice_absolute_end_coords(input->info()->tensor_shape(), ends);
+    // Construct the slice end mask from the (possibly negative) end coordinates
+    const int32_t slice_end_mask = arm_compute::helpers::tensor_transform::construct_slice_end_mask(ends);
 
     auto k = arm_compute::support::cpp14::make_unique<CLStridedSliceKernel>();
-    k->configure(input, output, starts, ends_abs, BiStrides(), 0, 0, 0);
+    k->configure(input, output, starts, ends, BiStrides(), 0, slice_end_mask, 0);
     _kernel = std::move(k);
 }
 
@@ -54,8 +54,8 @@
     }));
 
-    // Get absolute end coordinates
-    const Coordinates ends_abs = arm_compute::helpers::tensor_transform::slice_absolute_end_coords(input->tensor_shape(), ends);
+    // Construct the slice end mask from the (possibly negative) end coordinates
+    const int32_t slice_end_mask = arm_compute::helpers::tensor_transform::construct_slice_end_mask(ends);
 
-    return CLStridedSliceKernel::validate(input, output, starts, ends_abs, BiStrides(), 0, 0, 0);
+    return CLStridedSliceKernel::validate(input, output, starts, ends, BiStrides(), 0, slice_end_mask, 0);
 }
 } // namespace arm_compute
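Instead of resolving negative end coordinates into absolute positions up front, the slice now forwards them unchanged and records the negative dimensions in an end mask, which tells CLStridedSliceKernel to run those ranges to the end of the dimension. The public signature is unchanged; a usage sketch:

    #include "arm_compute/runtime/CL/functions/CLSlice.h"

    // Take columns 0..2; end -1 on the second axis is carried by the mask,
    // so rows run to the end of the dimension
    CLSlice slice;
    slice.configure(&src, &dst, Coordinates(0, 0), Coordinates(3, -1));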
diff --git a/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp b/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp
index 76c1e18..a24b72e 100644
--- a/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp
+++ b/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp
@@ -33,20 +33,19 @@
 namespace arm_compute
 {
 CLSpaceToBatchLayer::CLSpaceToBatchLayer()
-    : _space_to_batch_kernel(), _output(nullptr), _has_padding(false)
+    : _space_to_batch_kernel(), _memset_kernel(), _has_padding(false)
 {
 }
 
 void CLSpaceToBatchLayer::configure(const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output)
 {
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, block_shape, paddings, output);
 
     if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size())
     {
         _has_padding = true;
+        _memset_kernel.configure(output, PixelValue());
     }
-
-    _output = output;
     _space_to_batch_kernel.configure(input, block_shape, paddings, output);
 }
 
@@ -57,42 +56,35 @@
     if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size())
     {
         _has_padding = true;
+        _memset_kernel.configure(output, PixelValue());
     }
-
-    _output = output;
     _space_to_batch_kernel.configure(input, block_shape_x, block_shape_y, padding_left, padding_right, output);
 }
 
 Status CLSpaceToBatchLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output)
 {
-    return CLSpaceToBatchLayerKernel::validate(input, block_shape, paddings, output);
+    ARM_COMPUTE_RETURN_ON_ERROR(CLMemsetKernel::validate(output, PixelValue()));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLSpaceToBatchLayerKernel::validate(input, block_shape, paddings, output));
+
+    return Status{};
 }
 
 Status CLSpaceToBatchLayer::validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right,
                                      const ITensorInfo *output)
 {
-    return CLSpaceToBatchLayerKernel::validate(input, block_shape_x, block_shape_y, padding_left, padding_right, output);
+    ARM_COMPUTE_RETURN_ON_ERROR(CLMemsetKernel::validate(output, PixelValue()));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLSpaceToBatchLayerKernel::validate(input, block_shape_x, block_shape_y, padding_left, padding_right, output));
+
+    return Status{};
 }
 
 void CLSpaceToBatchLayer::run()
 {
     // Zero out output only if we have paddings
-    // TODO(micspy01): replace with memset once ready
     if(_has_padding)
     {
-        _output->map(CLScheduler::get().queue(), true);
-        if(is_data_type_quantized_asymmetric(_output->info()->data_type()))
-        {
-            const uint8_t quantized_zero = _output->info()->quantization_info().offset;
-            std::fill_n(_output->buffer(), _output->info()->total_size(), quantized_zero);
-        }
-        else
-        {
-            memset(_output->buffer(), 0, _output->info()->total_size());
-        }
-        _output->unmap(CLScheduler::get().queue());
+        CLScheduler::get().enqueue(_memset_kernel, true);
     }
-
     CLScheduler::get().enqueue(_space_to_batch_kernel, true);
 }
 } // namespace arm_compute
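A usage sketch of the static block-shape overload; with this change, when the output is larger than the input, the padded region is cleared on the device by the CLMemsetKernel enqueued in run() rather than by mapping the buffer and filling it on the host:

    #include "arm_compute/runtime/CL/functions/CLSpaceToBatchLayer.h"

    CLSpaceToBatchLayer space_to_batch;
    space_to_batch.configure(&src, /*block_shape_x=*/2, /*block_shape_y=*/2,
                             Size2D(1, 1), Size2D(1, 1), &dst);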
diff --git a/src/runtime/CL/functions/CLStackLayer.cpp b/src/runtime/CL/functions/CLStackLayer.cpp
new file mode 100644
index 0000000..71327fe
--- /dev/null
+++ b/src/runtime/CL/functions/CLStackLayer.cpp
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <complex>
+
+#include "arm_compute/runtime/CL/functions/CLStackLayer.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+CLStackLayer::CLStackLayer() // NOLINT
+    : _input(),
+      _stack_kernels(),
+      _num_inputs(0)
+{
+}
+
+void CLStackLayer::configure(const std::vector<ICLTensor *> &input, int axis, ICLTensor *output)
+{
+    _num_inputs    = input.size();
+    _stack_kernels = arm_compute::support::cpp14::make_unique<CLStackLayerKernel[]>(_num_inputs);
+
+    // Wrap around negative values
+    const unsigned int axis_u = wrap_around(axis, static_cast<int>(input[0]->info()->num_dimensions() + 1));
+
+    for(unsigned int i = 0; i < _num_inputs; i++)
+    {
+        _stack_kernels[i].configure(input[i], axis_u, i, _num_inputs, output);
+    }
+}
+
+Status CLStackLayer::validate(const std::vector<ITensorInfo *> &input, int axis, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+    ARM_COMPUTE_RETURN_ERROR_ON(input.empty());
+
+    // Wrap around negative values
+    const size_t       rank   = input[0]->num_dimensions();
+    const unsigned int axis_u = wrap_around(axis, static_cast<int>(rank + 1));
+
+    const unsigned int num_inputs = input.size();
+
+    for(unsigned int i = 0; i < num_inputs; i++)
+    {
+        // All the tensors must have the same rank
+        ARM_COMPUTE_RETURN_ERROR_ON(input[i]->num_dimensions() != rank);
+        // Validate Kernel
+        ARM_COMPUTE_RETURN_ON_ERROR(CLStackLayerKernel::validate(input[i], axis_u, i, num_inputs, output));
+    }
+
+    return Status{};
+}
+
+void CLStackLayer::run()
+{
+    for(unsigned i = 0; i < _num_inputs; i++)
+    {
+        CLScheduler::get().enqueue(_stack_kernels[i], false);
+    }
+}
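CLStackLayer joins equally shaped tensors along a new axis, one kernel per input; negative axes wrap against rank + 1, so -1 addresses the freshly created dimension. A sketch with three assumed-initialized 2D inputs:

    #include "arm_compute/runtime/CL/functions/CLStackLayer.h"

    std::vector<ICLTensor *> inputs = { &t0, &t1, &t2 };
    CLStackLayer stack;
    stack.configure(inputs, /*axis=*/0, &out); // out gains a leading dimension of 3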
diff --git a/src/runtime/CL/functions/CLTile.cpp b/src/runtime/CL/functions/CLTile.cpp
new file mode 100644
index 0000000..ec6a4ab
--- /dev/null
+++ b/src/runtime/CL/functions/CLTile.cpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLTile.h"
+
+#include "arm_compute/core/CL/kernels/CLTileKernel.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+void CLTile::configure(const ICLTensor *input, ICLTensor *output, const Multiples &multiples)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLTileKernel>();
+    k->configure(input, output, multiples);
+    _kernel = std::move(k);
+}
+
+Status CLTile::validate(const ITensorInfo *input, const ITensorInfo *output, const Multiples &multiples)
+{
+    return CLTileKernel::validate(input, output, multiples);
+}
+} // namespace arm_compute
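CLTile repeats the input along each dimension according to the given multiples (a vector of per-dimension repeat counts). A sketch:

    #include "arm_compute/runtime/CL/functions/CLTile.h"

    // Repeat twice along x and three times along y
    CLTile tile;
    tile.configure(&src, &dst, Multiples{ 2U, 3U });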
diff --git a/src/runtime/CL/functions/CLUnstack.cpp b/src/runtime/CL/functions/CLUnstack.cpp
new file mode 100644
index 0000000..428d091
--- /dev/null
+++ b/src/runtime/CL/functions/CLUnstack.cpp
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLUnstack.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+namespace arm_compute
+{
+namespace
+{
+inline unsigned int wrap_axis(int axis, const ITensorInfo *const tensor)
+{
+    return wrap_around(axis, static_cast<int>(tensor->num_dimensions()));
+}
+
+inline void setup_slice_coordinates_and_mask(Coordinates &slice_start, int32_t &slice_end_mask, const unsigned int input_num_dimensions)
+{
+    // Sets up the coordinates to slice the input tensor: all start coordinates are zeroed and the end mask covers the full range, so the caller only has to set the unstacking axis to take one 2D tensor at a time.
+    Coordinates slice_end;
+    slice_start.set_num_dimensions(input_num_dimensions);
+    slice_end.set_num_dimensions(input_num_dimensions);
+    for(size_t k = 0; k < input_num_dimensions; ++k)
+    {
+        slice_start.set(k, 0);
+        slice_end.set(k, -1);
+    }
+    slice_end_mask = arm_compute::helpers::tensor_transform::construct_slice_end_mask(slice_end);
+}
+} // namespace
+
+CLUnstack::CLUnstack() // NOLINT
+    : _num_slices(0),
+      _strided_slice_vector()
+{
+}
+
+void CLUnstack::configure(const ICLTensor *input, const std::vector<ICLTensor *> &output_vector, int axis)
+{
+    std::vector<ITensorInfo *> outputs_vector_info(output_vector.size());
+    std::transform(output_vector.begin(), output_vector.end(), outputs_vector_info.begin(), [](ICLTensor * t)
+    {
+        ARM_COMPUTE_ERROR_ON_NULLPTR(t);
+        return t->info();
+    });
+
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+    ARM_COMPUTE_ERROR_THROW_ON(CLUnstack::validate(input->info(), outputs_vector_info, axis));
+
+    // Wrap around negative values
+    const unsigned int axis_u = wrap_axis(axis, input->info());
+    _num_slices               = std::min(outputs_vector_info.size(), input->info()->dimension(axis_u));
+    _strided_slice_vector     = arm_compute::support::cpp14::make_unique<CLStridedSlice[]>(_num_slices);
+
+    Coordinates slice_start;
+    int32_t     slice_end_mask;
+    setup_slice_coordinates_and_mask(slice_start, slice_end_mask, input->info()->tensor_shape().num_dimensions());
+    for(unsigned int slice = 0; slice < _num_slices; ++slice)
+    {
+        // Adjust the start coordinate on the unstacking axis to take one 2D slice at a time
+        slice_start.set(axis_u, slice);
+        _strided_slice_vector[slice].configure(input, output_vector[slice], slice_start, Coordinates(), BiStrides(), 0, slice_end_mask, (1 << axis_u));
+    }
+}
+
+Status CLUnstack::validate(const ITensorInfo *input, const std::vector<ITensorInfo *> &output_vector, int axis)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+    ARM_COMPUTE_RETURN_ERROR_ON(output_vector.empty());
+    ARM_COMPUTE_RETURN_ERROR_ON(axis < (-static_cast<int>(input->tensor_shape().num_dimensions())));
+    ARM_COMPUTE_RETURN_ERROR_ON(axis >= static_cast<int>(input->tensor_shape().num_dimensions()));
+    const unsigned int num_slices = std::min(output_vector.size(), input->dimension(wrap_axis(axis, input)));
+    ARM_COMPUTE_RETURN_ERROR_ON(num_slices > input->dimension(wrap_axis(axis, input)));
+    ARM_COMPUTE_RETURN_ERROR_ON(num_slices > output_vector.size());
+    Coordinates slice_start;
+    int32_t     slice_end_mask;
+    for(size_t k = 0; k < num_slices; ++k)
+    {
+        setup_slice_coordinates_and_mask(slice_start, slice_end_mask, input->tensor_shape().num_dimensions());
+        slice_start.set(wrap_axis(axis, input), k);
+        ARM_COMPUTE_RETURN_ON_ERROR(CLStridedSlice::validate(input, output_vector[k], slice_start, Coordinates(), BiStrides(), 0, slice_end_mask, (1 << wrap_axis(axis, input))));
+    }
+    return Status{};
+}
+
+void CLUnstack::run()
+{
+    for(unsigned i = 0; i < _num_slices; ++i)
+    {
+        _strided_slice_vector[i].run();
+    }
+}
+
+} // namespace arm_compute
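CLUnstack is the inverse of stacking: it splits the input along the chosen axis into one output per slice, each produced by a CLStridedSlice configured with shrink mask (1 << axis) so the sliced dimension disappears from the outputs. A sketch splitting along the last axis:

    #include "arm_compute/runtime/CL/functions/CLUnstack.h"

    // Split a WxHx2 tensor into two WxH outputs along axis 2 (or, equivalently, -1)
    std::vector<ICLTensor *> outputs = { &s0, &s1 };
    CLUnstack unstack;
    unstack.configure(&src, outputs, 2);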
diff --git a/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp b/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp
index 46a2d80..d0801a6 100644
--- a/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp
+++ b/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -50,8 +50,8 @@
     ARM_COMPUTE_RETURN_ERROR_ON(num_inputs < 2);
 
     // Output auto initialization if not yet initialized
-    TensorInfo  tmp_output_info = *output->clone();
-    TensorShape output_shape    = arm_compute::misc::shape_calculator::calculate_width_concatenate_shape(inputs_vector);
+    TensorInfo        tmp_output_info = *output->clone();
+    const TensorShape output_shape    = arm_compute::misc::shape_calculator::calculate_width_concatenate_shape(inputs_vector);
     auto_init_if_empty(tmp_output_info, output_shape, 1, inputs_vector[0]->data_type());
 
     switch(num_inputs)
@@ -90,7 +90,7 @@
     {
         inputs_vector_info.emplace_back(inputs_vector.at(i)->info());
     }
-    TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_width_concatenate_shape(inputs_vector);
+    const TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_width_concatenate_shape(inputs_vector);
 
     // Output auto initialization if not yet initialized
     auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type());
diff --git a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
index 1abcb67..069196e 100644
--- a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
@@ -84,8 +84,8 @@
 } // namespace
 
 CLWinogradConvolutionLayer::CLWinogradConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(memory_manager), _batched_mm(memory_manager), _input_transform(), _filter_transform(), _output_transform(), _activationlayer_function(), _input0(), _input1(), _batched_mm_output(),
-      _original_weights(nullptr), _is_prepared(false), _is_activationlayer_enabled(false)
+    : _memory_group(memory_manager), _batched_mm(memory_manager), _input_transform(), _filter_transform(), _output_transform(), _input0(), _input1(), _batched_mm_output(), _original_weights(nullptr),
+      _is_prepared(false)
 {
 }
 
@@ -133,14 +133,7 @@
                                                                                                  (input->info()->data_type() == DataType::F16)));
 
     // Configure output transform
-    _output_transform.configure(&_batched_mm_output, biases, output, winograd_info);
-
-    // Configure activation layer
-    _is_activationlayer_enabled = act_info.enabled();
-    if(_is_activationlayer_enabled)
-    {
-        _activationlayer_function.configure(output, nullptr, act_info);
-    }
+    _output_transform.configure(&_batched_mm_output, biases, output, winograd_info, act_info);
 
     // Allocate temporary tensors
     _input0.allocator()->allocate();
@@ -216,11 +209,6 @@
     // Run output transform
     CLScheduler::get().enqueue(_output_transform);
 
-    if(_is_activationlayer_enabled)
-    {
-        _activationlayer_function.run();
-    }
-
     _memory_group.release();
 }
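With the activation folded into the Winograd output transform, passing act_info to configure() no longer schedules a separate CLActivationLayer pass over the output. A hedged sketch of the call (3x3 weights, stride 1, as Winograd requires):

    #include "arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h"

    CLWinogradConvolutionLayer wconv;
    wconv.configure(&src, &weights, &biases, &dst,
                    PadStrideInfo(1, 1, 1, 1),
                    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));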
 
diff --git a/src/runtime/CL/functions/CLWinogradInputTransform.cpp b/src/runtime/CL/functions/CLWinogradInputTransform.cpp
index 09e8456..7361eb2 100644
--- a/src/runtime/CL/functions/CLWinogradInputTransform.cpp
+++ b/src/runtime/CL/functions/CLWinogradInputTransform.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -35,7 +35,7 @@
     auto k = arm_compute::support::cpp14::make_unique<CLWinogradInputTransformKernel>();
     k->configure(input, output, winograd_info);
     _kernel = std::move(k);
-    _border_handler.configure(input, _kernel->border_size(), BorderMode::CONSTANT, PixelValue(0));
+    _border_handler.configure(input, _kernel->border_size(), BorderMode::CONSTANT, PixelValue());
 }
 
 Status CLWinogradInputTransform::validate(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info)