arm_compute v18.05
diff --git a/tests/validation/CL/BatchNormalizationLayer.cpp b/tests/validation/CL/BatchNormalizationLayer.cpp
index ef53515..f6dc6b3 100644
--- a/tests/validation/CL/BatchNormalizationLayer.cpp
+++ b/tests/validation/CL/BatchNormalizationLayer.cpp
@@ -32,6 +32,7 @@
 #include "tests/framework/Asserts.h"
 #include "tests/framework/Macros.h"
 #include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/Helpers.h"
 #include "tests/validation/Validation.h"
 #include "tests/validation/fixtures/BatchNormalizationLayerFixture.h"
 
@@ -61,15 +62,25 @@
 template <typename T>
 using CLBatchNormalizationLayerFixture = BatchNormalizationLayerValidationFixture<CLTensor, CLAccessor, CLBatchNormalizationLayer, T>;
 
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::RandomBatchNormalizationLayerDataset(), framework::dataset::make("DataType", { DataType::QS8, DataType::QS16, DataType::F16, DataType::F32 })),
-               shape0, shape1, epsilon, dt)
+DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(combine(datasets::RandomBatchNormalizationLayerDataset(),
+                                                                                   combine(framework::dataset::make("UseBeta", { false, true }),
+                                                                                           framework::dataset::make("UseGamma", { false, true }))),
+                                                                           framework::dataset::make("DataType", { DataType::QS8, DataType::QS16, DataType::F16, DataType::F32 })),
+                                                                   framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+               shape0, shape1, epsilon, use_gamma, use_beta, dt, data_layout)
 {
     // Set fixed point position data type allowed
     const int fixed_point_position = (arm_compute::is_data_type_fixed_point(dt)) ? 3 : 0;
 
+    TensorShape src_dst_shapes = shape0;
+    if(data_layout == DataLayout::NHWC)
+    {
+        permute(src_dst_shapes, PermutationVector(2U, 0U, 1U));
+    }
+
     // Create tensors
-    CLTensor src   = create_tensor<CLTensor>(shape0, dt, 1, fixed_point_position);
-    CLTensor dst   = create_tensor<CLTensor>(shape0, dt, 1, fixed_point_position);
+    CLTensor src   = create_tensor<CLTensor>(src_dst_shapes, dt, 1, fixed_point_position, QuantizationInfo(), data_layout);
+    CLTensor dst   = create_tensor<CLTensor>(src_dst_shapes, dt, 1, fixed_point_position, QuantizationInfo(), data_layout);
     CLTensor mean  = create_tensor<CLTensor>(shape1, dt, 1, fixed_point_position);
     CLTensor var   = create_tensor<CLTensor>(shape1, dt, 1, fixed_point_position);
     CLTensor beta  = create_tensor<CLTensor>(shape1, dt, 1, fixed_point_position);
@@ -77,10 +88,12 @@
 
     // Create and Configure function
     CLBatchNormalizationLayer norm;
-    norm.configure(&src, &dst, &mean, &var, &beta, &gamma, epsilon);
+    CLTensor                 *beta_ptr  = use_beta ? &beta : nullptr;
+    CLTensor                 *gamma_ptr = use_gamma ? &gamma : nullptr;
+    norm.configure(&src, &dst, &mean, &var, beta_ptr, gamma_ptr, epsilon);
 
     // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape0);
+    const ValidRegion valid_region = shape_to_valid_region(src_dst_shapes);
     validate(dst.info()->valid_region(), valid_region);
 }
 
@@ -150,9 +163,12 @@
 
 TEST_SUITE(Float)
 TEST_SUITE(FP32)
-FIXTURE_DATA_TEST_CASE(Random, CLBatchNormalizationLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::RandomBatchNormalizationLayerDataset(),
+FIXTURE_DATA_TEST_CASE(Random, CLBatchNormalizationLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::RandomBatchNormalizationLayerDataset(),
+                                                                                                                   combine(framework::dataset::make("UseBeta", { false, true }),
+                                                                                                                           framework::dataset::make("UseGamma", { false, true }))),
                                                                                                                    act_infos),
-                                                                                                                   framework::dataset::make("DataType", DataType::F32)))
+                                                                                                                   framework::dataset::make("DataType", DataType::F32)),
+                                                                                                                   framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f32, 0);
@@ -160,9 +176,12 @@
 TEST_SUITE_END()
 
 TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(Random, CLBatchNormalizationLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::RandomBatchNormalizationLayerDataset(),
+FIXTURE_DATA_TEST_CASE(Random, CLBatchNormalizationLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::RandomBatchNormalizationLayerDataset(),
+                                                                                                                  combine(framework::dataset::make("UseBeta", { false, true }),
+                                                                                                                          framework::dataset::make("UseGamma", { false, true }))),
                                                                                                                   framework::dataset::make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f))),
-                                                                                                                  framework::dataset::make("DataType", DataType::F16)))
+                                                                                                                  framework::dataset::make("DataType", DataType::F16)),
+                                                                                                                  framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f16, 0);
@@ -175,10 +194,14 @@
 using CLBatchNormalizationLayerFixedPointFixture = BatchNormalizationLayerValidationFixedPointFixture<CLTensor, CLAccessor, CLBatchNormalizationLayer, T>;
 
 TEST_SUITE(QS8)
-FIXTURE_DATA_TEST_CASE(Random, CLBatchNormalizationLayerFixedPointFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::RandomBatchNormalizationLayerDataset(),
-                       framework::dataset::make("ActivationInfo", ActivationLayerInfo())),
-                       framework::dataset::make("DataType", DataType::QS8)),
-                       framework::dataset::make("FractionalBits", 1, 6)))
+FIXTURE_DATA_TEST_CASE(Random, CLBatchNormalizationLayerFixedPointFixture<int8_t>, framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(combine(combine(combine(datasets::RandomBatchNormalizationLayerDataset(),
+                                                                       framework::dataset::make("UseBeta", false)),
+                                                               framework::dataset::make("UseGamma", false)),
+                                                       framework::dataset::make("ActivationInfo", ActivationLayerInfo())),
+                                               framework::dataset::make("DataType", DataType::QS8)),
+                                       framework::dataset::make("DataLayout", DataLayout::NCHW)),
+                               framework::dataset::make("FractionalBits", 1, 6)))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_qs8, 0);
@@ -186,10 +209,14 @@
 TEST_SUITE_END()
 
 TEST_SUITE(QS16)
-FIXTURE_DATA_TEST_CASE(Random, CLBatchNormalizationLayerFixedPointFixture<int16_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::RandomBatchNormalizationLayerDataset(),
-                       framework::dataset::make("ActivationInfo", ActivationLayerInfo())),
-                       framework::dataset::make("DataType", DataType::QS16)),
-                       framework::dataset::make("FractionalBits", 1, 14)))
+FIXTURE_DATA_TEST_CASE(Random, CLBatchNormalizationLayerFixedPointFixture<int16_t>, framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(combine(combine(combine(datasets::RandomBatchNormalizationLayerDataset(),
+                                                                       framework::dataset::make("UseBeta", false)),
+                                                               framework::dataset::make("UseGamma", false)),
+                                                       framework::dataset::make("ActivationInfo", ActivationLayerInfo())),
+                                               framework::dataset::make("DataType", DataType::QS16)),
+                                       framework::dataset::make("DataLayout", DataLayout::NCHW)),
+                               framework::dataset::make("FractionalBits", 1, 14)))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_qs16, 0);
diff --git a/tests/validation/CL/ChannelCombine.cpp b/tests/validation/CL/ChannelCombine.cpp
new file mode 100644
index 0000000..d8eccba
--- /dev/null
+++ b/tests/validation/CL/ChannelCombine.cpp
@@ -0,0 +1,158 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLMultiImage.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/CLTensorAllocator.h"
+#include "arm_compute/runtime/CL/functions/CLChannelCombine.h"
+#include "tests/CL/CLAccessor.h"
+#include "tests/PaddingCalculator.h"
+#include "tests/datasets/ConvertPolicyDataset.h"
+#include "tests/datasets/ShapeDatasets.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/fixtures/ChannelCombineFixture.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace
+{
+inline void validate_configuration(const TensorShape &shape, Format format)
+{
+    const int num_planes = num_planes_from_format(format);
+
+    // Create tensors
+    CLMultiImage          dst     = create_multi_image<CLMultiImage>(shape, format);
+    std::vector<CLTensor> ref_src = create_tensor_planes<CLTensor>(shape, format);
+
+    // Create and configure function
+    CLChannelCombine channel_combine;
+
+    if(num_planes == 1)
+    {
+        const CLTensor *tensor_extra = ((Format::RGBA8888 == format) ? &ref_src[3] : nullptr);
+
+        channel_combine.configure(&ref_src[0], &ref_src[1], &ref_src[2], tensor_extra, dst.cl_plane(0));
+    }
+    else
+    {
+        channel_combine.configure(&ref_src[0], &ref_src[1], &ref_src[2], &dst);
+    }
+}
+} // namespace
+
+TEST_SUITE(CL)
+TEST_SUITE(ChannelCombine)
+
+template <typename T>
+using CLChannelCombineFixture = ChannelCombineValidationFixture<CLMultiImage, CLTensor, CLAccessor, CLChannelCombine, T>;
+
+TEST_SUITE(Configuration)
+DATA_TEST_CASE(RGBA, framework::DatasetMode::ALL, combine(concat(datasets::Small2DShapes(), datasets::Large2DShapes()), framework::dataset::make("FormatType", { Format::RGB888, Format::RGBA8888 })),
+               shape, format)
+{
+    validate_configuration(shape, format);
+}
+DATA_TEST_CASE(YUV, framework::DatasetMode::ALL, combine(concat(datasets::Small2DShapes(), datasets::Large2DShapes()), framework::dataset::make("FormatType", { Format::YUYV422, Format::UYVY422 })),
+               shape, format)
+{
+    validate_configuration(shape, format);
+}
+
+DATA_TEST_CASE(YUVPlanar, framework::DatasetMode::ALL, combine(concat(datasets::Small2DShapes(), datasets::Large2DShapes()), framework::dataset::make("FormatType", { Format::IYUV, Format::YUV444, Format::NV12, Format::NV21 })),
+               shape, format)
+{
+    validate_configuration(shape, format);
+}
+TEST_SUITE_END()
+
+TEST_SUITE(RGBA)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLChannelCombineFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(datasets::Small2DShapes(), framework::dataset::make("FormatType", { Format::RGB888, Format::RGBA8888 })))
+{
+    // Validate output
+    for(unsigned int plane_idx = 0; plane_idx < _num_planes; ++plane_idx)
+    {
+        validate(CLAccessor(*_target.cl_plane(plane_idx)), _reference[plane_idx]);
+    }
+}
+FIXTURE_DATA_TEST_CASE(RunLarge, CLChannelCombineFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Large2DShapes(), framework::dataset::make("FormatType", { Format::RGB888, Format::RGBA8888 })))
+{
+    // Validate output
+    for(unsigned int plane_idx = 0; plane_idx < _num_planes; ++plane_idx)
+    {
+        validate(CLAccessor(*_target.cl_plane(plane_idx)), _reference[plane_idx]);
+    }
+}
+TEST_SUITE_END()
+
+TEST_SUITE(YUV)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLChannelCombineFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(datasets::Small2DShapes(), framework::dataset::make("FormatType", { Format::YUYV422, Format::UYVY422 })))
+{
+    // Validate output
+    for(unsigned int plane_idx = 0; plane_idx < _num_planes; ++plane_idx)
+    {
+        validate(CLAccessor(*_target.cl_plane(plane_idx)), _reference[plane_idx]);
+    }
+}
+FIXTURE_DATA_TEST_CASE(RunLarge, CLChannelCombineFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Large2DShapes(), framework::dataset::make("FormatType", { Format::YUYV422, Format::UYVY422 })))
+{
+    // Validate output
+    for(unsigned int plane_idx = 0; plane_idx < _num_planes; ++plane_idx)
+    {
+        validate(CLAccessor(*_target.cl_plane(plane_idx)), _reference[plane_idx]);
+    }
+}
+TEST_SUITE_END()
+
+TEST_SUITE(YUVPlanar)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLChannelCombineFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(datasets::Small2DShapes(), framework::dataset::make("FormatType", { Format::NV12, Format::NV21, Format::IYUV, Format::YUV444 })))
+{
+    // Validate output
+    for(unsigned int plane_idx = 0; plane_idx < _num_planes; ++plane_idx)
+    {
+        validate(CLAccessor(*_target.cl_plane(plane_idx)), _reference[plane_idx]);
+    }
+}
+FIXTURE_DATA_TEST_CASE(RunLarge, CLChannelCombineFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Large2DShapes(), framework::dataset::make("FormatType", { Format::NV12, Format::NV21, Format::IYUV, Format::YUV444 })))
+{
+    // Validate output
+    for(unsigned int plane_idx = 0; plane_idx < _num_planes; ++plane_idx)
+    {
+        validate(CLAccessor(*_target.cl_plane(plane_idx)), _reference[plane_idx]);
+    }
+}
+TEST_SUITE_END()
+
+TEST_SUITE_END()
+TEST_SUITE_END()
+
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/CL/ChannelShuffle.cpp b/tests/validation/CL/ChannelShuffle.cpp
new file mode 100644
index 0000000..41813c4
--- /dev/null
+++ b/tests/validation/CL/ChannelShuffle.cpp
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/CLTensorAllocator.h"
+#include "arm_compute/runtime/CL/functions/CLChannelShuffleLayer.h"
+#include "tests/CL/CLAccessor.h"
+#include "tests/datasets/ChannelShuffleLayerDataset.h"
+#include "tests/datasets/ShapeDatasets.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/fixtures/ChannelShuffleLayerFixture.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+TEST_SUITE(CL)
+TEST_SUITE(ChannelShuffle)
+
+DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::SmallRandomChannelShuffleLayerDataset(), framework::dataset::make("DataType", { DataType::S8, DataType::U8, DataType::S16, DataType::U16, DataType::U32, DataType::S32, DataType::F16, DataType::F32 })),
+               shape, num_groups, data_type)
+{
+    // Create tensors
+    CLTensor ref_src = create_tensor<CLTensor>(shape, data_type);
+    CLTensor dst     = create_tensor<CLTensor>(shape, data_type);
+
+    // Create and Configure function
+    CLChannelShuffleLayer channel_shuffle_func;
+    channel_shuffle_func.configure(&ref_src, &dst, num_groups);
+
+    // Validate valid region
+    const ValidRegion valid_region = shape_to_valid_region(shape);
+    validate(dst.info()->valid_region(), valid_region);
+}
+
+template <typename T>
+using CLChannelShuffleLayerFixture = ChannelShuffleLayerValidationFixture<CLTensor, CLAccessor, CLChannelShuffleLayer, T>;
+
+TEST_SUITE(U8)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLChannelShuffleLayerFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallRandomChannelShuffleLayerDataset(),
+                                                                                                                   framework::dataset::make("DataType",
+                                                                                                                           DataType::U8)))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge, CLChannelShuffleLayerFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeRandomChannelShuffleLayerDataset(), framework::dataset::make("DataType",
+                                                                                                                 DataType::U8)))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference);
+}
+TEST_SUITE_END()
+
+TEST_SUITE(Float)
+TEST_SUITE(FP16)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLChannelShuffleLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallRandomChannelShuffleLayerDataset(), framework::dataset::make("DataType",
+                                                                                                                DataType::F16)))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge, CLChannelShuffleLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeRandomChannelShuffleLayerDataset(), framework::dataset::make("DataType",
+                                                                                                              DataType::F16)))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference);
+}
+TEST_SUITE_END()
+
+TEST_SUITE(FP32)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLChannelShuffleLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallRandomChannelShuffleLayerDataset(), framework::dataset::make("DataType",
+                                                                                                                 DataType::F32)))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge, CLChannelShuffleLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeRandomChannelShuffleLayerDataset(), framework::dataset::make("DataType",
+                                                                                                               DataType::F32)))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference);
+}
+TEST_SUITE_END()
+TEST_SUITE_END()
+
+TEST_SUITE_END()
+TEST_SUITE_END()
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/CL/ConvertFullyConnectedWeights.cpp b/tests/validation/CL/ConvertFullyConnectedWeights.cpp
new file mode 100644
index 0000000..f67c447
--- /dev/null
+++ b/tests/validation/CL/ConvertFullyConnectedWeights.cpp
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/CLTensorAllocator.h"
+#include "arm_compute/runtime/CL/functions/CLConvertFullyConnectedWeights.h"
+#include "tests/CL/CLAccessor.h"
+#include "tests/datasets/ShapeDatasets.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/fixtures/ConvertFullyConnectedWeightsFixture.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace
+{
+auto params = combine(framework::dataset::make("WeightsWidth", { 16, 32, 64 }), framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }));
+} // namespace
+
+TEST_SUITE(CL)
+TEST_SUITE(ConvertFullyConnectedWeights)
+
+template <typename T>
+using CLConvertFullyConnectedWeightsFixture = ConvertFullyConnectedWeightsValidationFixture<CLTensor, CLAccessor, CLConvertFullyConnectedWeights, T>;
+
+TEST_SUITE(FP32)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLConvertFullyConnectedWeightsFixture<float>, framework::DatasetMode::ALL, combine(datasets::Small3DShapes(), combine(params, framework::dataset::make("DataType",
+                                                                                                                    DataType::F32))))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge, CLConvertFullyConnectedWeightsFixture<float>, framework::DatasetMode::NIGHTLY, combine(datasets::Large3DShapes(), combine(params, framework::dataset::make("DataType",
+                                                                                                                        DataType::F32))))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference);
+}
+TEST_SUITE_END()
+
+TEST_SUITE(FP16)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLConvertFullyConnectedWeightsFixture<half>, framework::DatasetMode::ALL, combine(datasets::Small3DShapes(), combine(params, framework::dataset::make("DataType",
+                                                                                                                   DataType::F16))))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge, CLConvertFullyConnectedWeightsFixture<half>, framework::DatasetMode::NIGHTLY, combine(datasets::Large3DShapes(), combine(params, framework::dataset::make("DataType",
+                                                                                                                       DataType::F16))))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference);
+}
+TEST_SUITE_END()
+
+TEST_SUITE(QASYMM8)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLConvertFullyConnectedWeightsFixture<uint8_t>, framework::DatasetMode::ALL, combine(datasets::Small3DShapes(), combine(params, framework::dataset::make("DataType",
+                                                                                                                      DataType::QASYMM8))))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge, CLConvertFullyConnectedWeightsFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Large3DShapes(), combine(params,
+                       framework::dataset::make("DataType",
+                                                DataType::QASYMM8))))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference);
+}
+TEST_SUITE_END()
+
+TEST_SUITE_END()
+TEST_SUITE_END()
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/CL/ConvolutionLayer.cpp b/tests/validation/CL/ConvolutionLayer.cpp
index 9b857c8..ec729b3 100644
--- a/tests/validation/CL/ConvolutionLayer.cpp
+++ b/tests/validation/CL/ConvolutionLayer.cpp
@@ -45,6 +45,7 @@
 {
 namespace
 {
+constexpr AbsoluteTolerance<float>  absolute_tolerance_float(0.0001f);    /**< Absolute Tolerance value for comparing reference's output against implementation's output for DataType::F32 */
 RelativeTolerance<float>            tolerance_f32(0.05f);                 /**< Tolerance value for comparing reference's output against implementation's output for DataType::F32 */
 RelativeTolerance<half_float::half> tolerance_f16(half_float::half(0.2)); /**< Tolerance value for comparing reference's output against implementation's output for DataType::F16 */
 constexpr AbsoluteTolerance<float>  tolerance_fixed(1.0f);                /**< Tolerance value for comparing reference's output against implementation's output for fixed point data types */
@@ -60,64 +61,100 @@
     DataType::QS16,
     DataType::QASYMM8,
 });
+const auto ActivationFunctionsDataset = framework::dataset::make("ActivationInfo",
+{
+    ActivationLayerInfo(),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 0.5f),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 0.5f)
+});
 } // namespace
 
 TEST_SUITE(CL)
 TEST_SUITE(ConvolutionLayer)
 
-DATA_TEST_CASE(ValidateConvolutionMethod, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(
-                                                                                               framework::dataset::make("InputInfo", { TensorInfo(TensorShape(17U, 31U, 2U), 1, DataType::F32, 0),
-                                                                                                                        TensorInfo(TensorShape(17U, 31U, 2U), 1, DataType::F32, 0),
-                                                                                                                        TensorInfo(TensorShape(23U, 27U, 5U, 4U), 1, DataType::F32, 0),
-                                                                                                                        TensorInfo(TensorShape(3U, 3U, 2U, 1U), 1, DataType::F32, 0),
-                                                                                                                        TensorInfo(TensorShape(33U, 27U, 7U, 4U), 1, DataType::F32, 0)
-                                                                                                                                     }),
-                                                                                               framework::dataset::make("WeightsInfo", { TensorInfo(TensorShape(5U, 5U, 2U, 19U), 1, DataType::F32, 0),
-                                                                                                                        TensorInfo(TensorShape(5U, 5U, 2U, 19U), 1, DataType::F32, 0),
-                                                                                                                        TensorInfo(TensorShape(3U, 3U, 5U, 21U), 1, DataType::F32, 0),
-                                                                                                                        TensorInfo(TensorShape(3U, 3U, 5U, 21U), 1, DataType::F32, 0),
-                                                                                                                        TensorInfo(TensorShape(5U, 5U, 7U, 16U), 1, DataType::F16, 0)
-                                                                                                                                       })),
-                                                                                           framework::dataset::make("BiasesInfo", { TensorInfo(TensorShape(19U), 1, DataType::F32, 0),
-                                                                                                                    TensorInfo(TensorShape(19U), 1, DataType::F32, 0),
-                                                                                                                    TensorInfo(TensorShape(21U), 1, DataType::F32, 0),
-                                                                                                                    TensorInfo(TensorShape(21U), 1, DataType::F32, 0),
-                                                                                                                    TensorInfo(TensorShape(16U), 1, DataType::F32, 0)
-                                                                                                                                  })),
-                                                                                       framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(15U, 15U, 19U), 1, DataType::F32, 0),
-                                                                                                                TensorInfo(TensorShape(15U, 15U, 19U), 1, DataType::F32, 0),
-                                                                                                                TensorInfo(TensorShape(21U, 25U, 21U, 4U), 1, DataType::F32, 0),
-                                                                                                                TensorInfo(TensorShape(11U, 25U, 21U), 1, DataType::F32, 0),
-                                                                                                                TensorInfo(TensorShape(11U, 12U, 16U, 4U), 1, DataType::F32, 0)
-                                                                                                                              })),
-                                                                                   framework::dataset::make("ConvInfo", { PadStrideInfo(1, 2, 1, 1),
-                                                                                                            PadStrideInfo(1, 2, 1, 1),
-                                                                                                            PadStrideInfo(1, 1, 0, 0),
-                                                                                                            PadStrideInfo(2, 1, 0, 0),
-                                                                                                            PadStrideInfo(3, 2, 1, 0)
-                                                                                                                        })),
-                                                                               framework::dataset::make("GpuTarget", { GPUTarget::BIFROST,
-                                                                                                                       GPUTarget::MIDGARD,
-                                                                                                                       GPUTarget::G70,
-                                                                                                                       GPUTarget::MIDGARD,
-                                                                                                                       GPUTarget::BIFROST
-                                                                                                                     })),
-
-                                                                           framework::dataset::make("Expected", { ConvolutionMethod::GEMM, ConvolutionMethod::GEMM, ConvolutionMethod::GEMM, ConvolutionMethod::GEMM, ConvolutionMethod::GEMM })),
-               input_info, weights_info, biases_info, output_info, conv_info, gpu_target, expected)
+DATA_TEST_CASE(ValidateConvolutionMethod, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zip(
+                                                                                                   framework::dataset::make("InputInfo", { TensorInfo(TensorShape(17U, 31U, 2U), 1, DataType::F32, 0),
+                                                                                                           TensorInfo(TensorShape(17U, 31U, 2U), 1, DataType::F32, 0),
+                                                                                                           TensorInfo(TensorShape(23U, 27U, 5U, 4U), 1, DataType::F32, 0),
+                                                                                                           TensorInfo(TensorShape(23U, 27U, 31U, 4U), 1, DataType::F32, 0),
+                                                                                                           TensorInfo(TensorShape(3U, 3U, 2U, 1U), 1, DataType::F32, 0),
+                                                                                                           TensorInfo(TensorShape(33U, 27U, 7U, 4U), 1, DataType::F32, 0),
+                                                                                                           TensorInfo(TensorShape(17U, 31U, 32U), 1, DataType::F32, 0),
+                                                                                                           TensorInfo(TensorShape(17U, 31U, 2U), 1, DataType::F32, 0)
+                                                                                                                                         }),
+                                                                                                   framework::dataset::make("WeightsInfo", { TensorInfo(TensorShape(5U, 5U, 2U, 19U), 1, DataType::F32, 0),
+                                                                                                           TensorInfo(TensorShape(5U, 5U, 2U, 19U), 1, DataType::F32, 0),
+                                                                                                           TensorInfo(TensorShape(3U, 3U, 5U, 21U), 1, DataType::F32, 0),
+                                                                                                           TensorInfo(TensorShape(3U, 3U, 31U, 21U), 1, DataType::F32, 0),
+                                                                                                           TensorInfo(TensorShape(3U, 3U, 5U, 21U), 1, DataType::F32, 0),
+                                                                                                           TensorInfo(TensorShape(5U, 5U, 7U, 16U), 1, DataType::F16, 0),
+                                                                                                           TensorInfo(TensorShape(5U, 5U, 32U, 19U), 1, DataType::F32, 0),
+                                                                                                           TensorInfo(TensorShape(5U, 5U, 2U, 19U), 1, DataType::F32, 0)
+                                                                                                                                           })),
+                                                                                               framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(15U, 15U, 19U), 1, DataType::F32, 0),
+                                                                                                                        TensorInfo(TensorShape(15U, 15U, 19U), 1, DataType::F32, 0),
+                                                                                                                        TensorInfo(TensorShape(21U, 25U, 21U, 4U), 1, DataType::F32, 0),
+                                                                                                                        TensorInfo(TensorShape(21U, 25U, 21U, 4U), 1, DataType::F32, 0),
+                                                                                                                        TensorInfo(TensorShape(11U, 25U, 21U), 1, DataType::F32, 0),
+                                                                                                                        TensorInfo(TensorShape(11U, 12U, 16U, 4U), 1, DataType::F32, 0),
+                                                                                                                        TensorInfo(TensorShape(17U, 31U, 19U), 1, DataType::F32, 0),
+                                                                                                                        TensorInfo(TensorShape(17U, 31U, 19U), 1, DataType::F32, 0)
+                                                                                                                                      })),
+                                                                                           framework::dataset::make("ConvInfo", { PadStrideInfo(1, 2, 1, 1),
+                                                                                                                    PadStrideInfo(1, 2, 1, 1),
+                                                                                                                    PadStrideInfo(1, 1, 0, 0),
+                                                                                                                    PadStrideInfo(1, 1, 0, 0),
+                                                                                                                    PadStrideInfo(2, 1, 0, 0),
+                                                                                                                    PadStrideInfo(3, 2, 1, 0),
+                                                                                                                    PadStrideInfo(1, 1, 2, 2),
+                                                                                                                    PadStrideInfo(1, 1, 2, 2)
+                                                                                                                                })),
+                                                                                       framework::dataset::make("GpuTarget", { GPUTarget::BIFROST,
+                                                                                                                GPUTarget::MIDGARD,
+                                                                                                                GPUTarget::G71,
+                                                                                                                GPUTarget::G71,
+                                                                                                                GPUTarget::MIDGARD,
+                                                                                                                GPUTarget::BIFROST,
+                                                                                                                GPUTarget::BIFROST,
+                                                                                                                GPUTarget::BIFROST
+                                                                                                                             })),
+                                                                                   framework::dataset::make("Dilation",
 {
-    ConvolutionMethod is_valid = CLConvolutionLayer::get_convolution_method(&input_info.clone()->set_is_resizable(false),
-                                                                            &weights_info.clone()->set_is_resizable(false),
-                                                                            &biases_info.clone()->set_is_resizable(false),
-                                                                            &output_info.clone()->set_is_resizable(false), conv_info, WeightsInfo(), gpu_target);
+    Size2D(1U, 1U),
+    Size2D(1U, 1U),
+    Size2D(1U, 1U),
+    Size2D(1U, 1U),
+    Size2D(1U, 1U),
+    Size2D(1U, 1U),
+    Size2D(1U, 1U),
+    Size2D(2U, 1U),
+})),
+framework::dataset::make("EnableFastMath", { false, false, false, false, false, false, true, true })),
+framework::dataset::make("Expected",
+{
+    ConvolutionMethod::GEMM, ConvolutionMethod::GEMM, ConvolutionMethod::GEMM, ConvolutionMethod::WINOGRAD, ConvolutionMethod::GEMM, ConvolutionMethod::GEMM, ConvolutionMethod::WINOGRAD, ConvolutionMethod::GEMM,
+})),
+input_info, weights_info, output_info, conv_info, gpu_target, dilation, enable_fast_math, expected)
+{
+    ConvolutionMethod is_valid = CLConvolutionLayer::get_convolution_method(&input_info.clone()->set_is_resizable(true),
+                                                                            &weights_info.clone()->set_is_resizable(true),
+                                                                            &output_info.clone()->set_is_resizable(true), conv_info,
+                                                                            WeightsInfo(),
+                                                                            ActivationLayerInfo(),
+                                                                            gpu_target,
+                                                                            dilation,
+                                                                            enable_fast_math);
     ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS);
 }
 TEST_SUITE_END()
 
 TEST_SUITE(GEMMConvolutionLayer)
 
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(framework::dataset::concat(datasets::SmallConvolutionLayerDataset(), datasets::LargeConvolutionLayerDataset()), CNNDataTypes),
-               input_shape, weights_shape, bias_shape, output_shape, info, data_type)
+DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(framework::dataset::concat(datasets::SmallConvolutionLayerDataset(), datasets::LargeConvolutionLayerDataset()),
+                                                                           CNNDataTypes),
+                                                                   ActivationFunctionsDataset),
+               input_shape, weights_shape, bias_shape, output_shape, info, dilation, data_type, act_info)
 {
     // Set fixed point position data type allowed
     int fixed_point_position = is_data_type_fixed_point(data_type) ? 3 : 0;
@@ -140,7 +177,7 @@
 
     // Create and configure function
     CLGEMMConvolutionLayer conv;
-    conv.configure(&src, &weights, &bias, &dst, info);
+    conv.configure(&src, &weights, &bias, &dst, info, WeightsInfo(), dilation, act_info);
 
     // Validate valid region
     const ValidRegion src_valid_region     = shape_to_valid_region(input_shape);
@@ -156,6 +193,8 @@
     // Validate QuantizationInfo
     ARM_COMPUTE_EXPECT(src.info()->quantization_info() == src_quantization_info, framework::LogLevel::ERRORS);
     ARM_COMPUTE_EXPECT(weights.info()->quantization_info() == weights_quantization_info, framework::LogLevel::ERRORS);
+
+    // Validate padding
 }
 
 template <typename T>
@@ -163,18 +202,24 @@
 
 TEST_SUITE(Float)
 TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMConvolutionLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallConvolutionLayerDataset(),
+
+FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMConvolutionLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallConvolutionLayerDataset(),
                                                                                                                  framework::dataset::make("ReshapeWeights", { true })),
                                                                                                                  framework::dataset::make("DataType",
-                                                                                                                         DataType::F16)))
+                                                                                                                         DataType::F16)),
+                                                                                                                 framework::dataset::make("DataLayout", { DataLayout::NCHW })),
+                                                                                                                 ActivationFunctionsDataset))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, CLGEMMConvolutionLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeConvolutionLayerDataset(),
+
+FIXTURE_DATA_TEST_CASE(RunLarge, CLGEMMConvolutionLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeConvolutionLayerDataset(),
                                                                                                                        framework::dataset::make("ReshapeWeights", { true })),
-                                                                                                               framework::dataset::make("DataType",
-                                                                                                                       DataType::F16)))
+                                                                                                                       framework::dataset::make("DataType",
+                                                                                                                               DataType::F16)),
+                                                                                                                       framework::dataset::make("DataLayout", { DataLayout::NCHW })),
+                                                                                                               ActivationFunctionsDataset))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num);
@@ -182,21 +227,27 @@
 TEST_SUITE_END()
 
 TEST_SUITE(FP32)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallConvolutionLayerDataset(),
+
+FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallConvolutionLayerDataset(),
                                                                                                                   framework::dataset::make("ReshapeWeights", { true })),
                                                                                                                   framework::dataset::make("DataType",
-                                                                                                                          DataType::F32)))
+                                                                                                                          DataType::F32)),
+                                                                                                                  framework::dataset::make("DataLayout", { DataLayout::NCHW })),
+                                                                                                                  ActivationFunctionsDataset))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f32);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, CLGEMMConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeConvolutionLayerDataset(),
+
+FIXTURE_DATA_TEST_CASE(RunLarge, CLGEMMConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeConvolutionLayerDataset(),
                                                                                                                         framework::dataset::make("ReshapeWeights", { true })),
-                                                                                                                framework::dataset::make("DataType",
-                                                                                                                        DataType::F32)))
+                                                                                                                        framework::dataset::make("DataType",
+                                                                                                                                DataType::F32)),
+                                                                                                                        framework::dataset::make("DataLayout", { DataLayout::NCHW })),
+                                                                                                                ActivationFunctionsDataset))
 {
     // Validate output
-    validate(CLAccessor(_target), _reference, tolerance_f32);
+    validate(CLAccessor(_target), _reference, tolerance_f32, 0.f, absolute_tolerance_float);
 }
 TEST_SUITE_END()
 TEST_SUITE_END()
@@ -207,20 +258,23 @@
 TEST_SUITE(FixedPoint)
 TEST_SUITE(QS8)
 // We test for fixed point precision [4,6]
-FIXTURE_DATA_TEST_CASE(RunTiny, CLGEMMConvolutionLayerFixedPointFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::TinyConvolutionLayerDataset(),
+FIXTURE_DATA_TEST_CASE(RunTiny, CLGEMMConvolutionLayerFixedPointFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::TinyConvolutionLayerDataset(),
                        framework::dataset::make("ReshapeWeights", { true })),
                        framework::dataset::make("DataType",
                                                 DataType::QS8)),
-                       framework::dataset::make("FractionalBits", 4, 7)))
+                       framework::dataset::make("FractionalBits", 4, 7)),
+                       ActivationFunctionsDataset))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_fixed);
 }
-FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMConvolutionLayerFixedPointFixture<int8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SmallConvolutionLayerDataset(),
+
+FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMConvolutionLayerFixedPointFixture<int8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::SmallConvolutionLayerDataset(),
                        framework::dataset::make("ReshapeWeights", { true })),
                        framework::dataset::make("DataType",
                                                 DataType::QS8)),
-                       framework::dataset::make("FractionalBits", 4, 7)))
+                       framework::dataset::make("FractionalBits", 4, 7)),
+                       ActivationFunctionsDataset))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_fixed);
@@ -229,20 +283,23 @@
 
 TEST_SUITE(QS16)
 // Testing for fixed point position [1,14)
-FIXTURE_DATA_TEST_CASE(RunTiny, CLGEMMConvolutionLayerFixedPointFixture<int16_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::TinyConvolutionLayerDataset(),
+FIXTURE_DATA_TEST_CASE(RunTiny, CLGEMMConvolutionLayerFixedPointFixture<int16_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::TinyConvolutionLayerDataset(),
                        framework::dataset::make("ReshapeWeights", { true })),
                        framework::dataset::make("DataType",
                                                 DataType::QS16)),
-                       framework::dataset::make("FractionalBits", 1, 14)))
+                       framework::dataset::make("FractionalBits", 1, 14)),
+                       ActivationFunctionsDataset))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_fixed);
 }
-FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMConvolutionLayerFixedPointFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SmallConvolutionLayerDataset(),
+
+FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMConvolutionLayerFixedPointFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::SmallConvolutionLayerDataset(),
                        framework::dataset::make("ReshapeWeights", { true })),
                        framework::dataset::make("DataType",
                                                 DataType::QS16)),
-                       framework::dataset::make("FractionalBits", 1, 14)))
+                       framework::dataset::make("FractionalBits", 1, 14)),
+                       ActivationFunctionsDataset))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_fixed);
@@ -253,20 +310,30 @@
 template <typename T>
 using CLGEMMConvolutionLayerQuantizedFixture = ConvolutionValidationQuantizedFixture<CLTensor, CLAccessor, CLGEMMConvolutionLayer, T>;
 
+const auto QuantizedActivationFunctionsDataset = framework::dataset::make("ActivationInfo",
+{
+    ActivationLayerInfo(),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.f)
+});
+
 TEST_SUITE(Quantized)
 TEST_SUITE(QASYMM8)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallConvolutionLayerDataset(),
+
+FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallConvolutionLayerDataset(),
                        framework::dataset::make("ReshapeWeights", { true })),
                        framework::dataset::make("DataType", DataType::QASYMM8)),
-                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255.f, 10) })))
+                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255.f, 10) })),
+                       QuantizedActivationFunctionsDataset))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_qasymm8);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, CLGEMMConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeConvolutionLayerDataset(),
+FIXTURE_DATA_TEST_CASE(RunLarge, CLGEMMConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeConvolutionLayerDataset(),
                        framework::dataset::make("ReshapeWeights", { true })),
                        framework::dataset::make("DataType", DataType::QASYMM8)),
-                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255.f, 0) })))
+                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255.f, 0) })),
+                       QuantizedActivationFunctionsDataset))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_qasymm8);
diff --git a/tests/validation/CL/Copy.cpp b/tests/validation/CL/Copy.cpp
new file mode 100644
index 0000000..033f7a6
--- /dev/null
+++ b/tests/validation/CL/Copy.cpp
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/CLTensorAllocator.h"
+#include "arm_compute/runtime/CL/functions/CLCopy.h"
+#include "tests/CL/CLAccessor.h"
+#include "tests/PaddingCalculator.h"
+#include "tests/datasets/ShapeDatasets.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/fixtures/CopyFixture.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+TEST_SUITE(CL)
+TEST_SUITE(Copy)
+
+template <typename T>
+using CLCopyFixture = CopyFixture<CLTensor, CLAccessor, CLCopy, T>; // Runs CLCopy and compares _target against the fixture's _reference
+
+TEST_SUITE(F32) // 32-bit float element copies
+FIXTURE_DATA_TEST_CASE(RunSmall, CLCopyFixture<float>, framework::DatasetMode::PRECOMMIT, combine(zip(datasets::SmallShapes(), datasets::SmallShapes()), framework::dataset::make("DataType", // zip: (src shape, dst shape) pairs
+                                                                                                  DataType::F32)))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge, CLCopyFixture<float>, framework::DatasetMode::NIGHTLY, combine(zip(datasets::LargeShapes(), datasets::LargeShapes()), framework::dataset::make("DataType",
+                                                                                                DataType::F32)))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference);
+}
+TEST_SUITE_END() // F32
+
+TEST_SUITE(U8) // 8-bit unsigned element copies
+FIXTURE_DATA_TEST_CASE(RunSmall, CLCopyFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(zip(datasets::SmallShapes(), datasets::SmallShapes()), framework::dataset::make("DataType",
+                                                                                                    DataType::U8)))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge, CLCopyFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(zip(datasets::LargeShapes(), datasets::LargeShapes()), framework::dataset::make("DataType",
+                                                                                                  DataType::U8)))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference);
+}
+TEST_SUITE_END() // U8
+
+TEST_SUITE(U16) // 16-bit unsigned element copies
+FIXTURE_DATA_TEST_CASE(RunSmall, CLCopyFixture<uint16_t>, framework::DatasetMode::PRECOMMIT, combine(zip(datasets::SmallShapes(), datasets::SmallShapes()), framework::dataset::make("DataType",
+                                                                                                     DataType::U16)))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge, CLCopyFixture<uint16_t>, framework::DatasetMode::NIGHTLY, combine(zip(datasets::LargeShapes(), datasets::LargeShapes()), framework::dataset::make("DataType",
+                                                                                                   DataType::U16)))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference);
+}
+TEST_SUITE_END() // U16
+
+TEST_SUITE_END() // Copy
+TEST_SUITE_END() // CL
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/CL/DepthwiseConvolutionLayer.cpp b/tests/validation/CL/DepthwiseConvolutionLayer.cpp
index 8ac882c..093d342 100644
--- a/tests/validation/CL/DepthwiseConvolutionLayer.cpp
+++ b/tests/validation/CL/DepthwiseConvolutionLayer.cpp
@@ -45,60 +45,262 @@
 RelativeTolerance<half_float::half>  tolerance_f16(half_float::half(0.001)); /**< Tolerance value for comparing reference's output against implementation's output for DataType::F16 */
 constexpr RelativeTolerance<float>   tolerance_f32(0.01f);                   /**< Tolerance value for comparing reference's output against implementation's output for DataType::F32 */
 constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(1);                   /**< Tolerance value for comparing reference's output against implementation's output for DataType::QASYMM8 */
+constexpr float                      tolerance_num = 0.05f;                  /**< Tolerance number */
+
+const auto depth_multipliers = framework::dataset::make("DepthMultiplier", { 1, 2, 3 });
 } // namespace
 
 TEST_SUITE(CL)
 TEST_SUITE(DepthwiseConvolutionLayer)
 
+// *INDENT-OFF*
+// clang-format off
+DATA_TEST_CASE(Validate3x3, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zip( // Per-row pass/fail cases for CLDepthwiseConvolutionLayer3x3::validate(); each row's reason is noted inline
+               framework::dataset::make("InputInfo", { TensorInfo(TensorShape(32U, 18U, 2U), 1, DataType::F32, 0),     // Mismatching data type input/weights
+                                                       TensorInfo(TensorShape(32U, 18U, 3U), 1, DataType::F32, 0),     // Mismatching input feature maps
+                                                       TensorInfo(TensorShape(32U, 18U, 2U), 1, DataType::F32, 0),     // Unsupported weights dimensions
+                                                       TensorInfo(TensorShape(32U, 18U, 2U), 1, DataType::QASYMM8, 0), // Unsupported activation
+                                                       TensorInfo(TensorShape(32U, 18U, 2U), 1, DataType::F32, 0),     // Mismatching depth multiplier
+                                                       TensorInfo(TensorShape(32U, 18U, 2U), 1, DataType::F32, 0),     // Invalid stride
+                                                       TensorInfo(TensorShape(32U, 18U, 2U), 1, DataType::F32, 0),     // Invalid biases size
+                                                       TensorInfo(TensorShape(32U, 18U, 2U), 1, DataType::F32, 0),     // Invalid biases dimensions
+                                                       TensorInfo(TensorShape(32U, 18U, 2U), 1, DataType::F32, 0),     // Invalid output size
+                                                       TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32, 0),     // Window shrink
+                                                       TensorInfo(TensorShape(32U, 18U, 8U), 1, DataType::F32, 0),
+                                                       TensorInfo(TensorShape(50U, 32U, 8U), 1, DataType::QASYMM8, 0),
+                                                     }),
+               framework::dataset::make("WeightsInfo", { TensorInfo(TensorShape(3U, 3U, 2U), 1, DataType::F16, 0),
+                                                         TensorInfo(TensorShape(3U, 3U, 2U), 1, DataType::F32, 0),
+                                                         TensorInfo(TensorShape(5U, 5U, 2U), 1, DataType::F32, 0),
+                                                         TensorInfo(TensorShape(3U, 3U, 2U), 1, DataType::QASYMM8, 0),
+                                                         TensorInfo(TensorShape(3U, 3U, 2U), 1, DataType::F32, 0),
+                                                         TensorInfo(TensorShape(3U, 3U, 2U), 1, DataType::F32, 0),
+                                                         TensorInfo(TensorShape(3U, 3U, 2U), 1, DataType::F32, 0),
+                                                         TensorInfo(TensorShape(3U, 3U, 2U), 1, DataType::F32, 0),
+                                                         TensorInfo(TensorShape(3U, 3U, 2U), 1, DataType::F32, 0),
+                                                         TensorInfo(TensorShape(3U, 3U, 2U), 1, DataType::F32, 0),
+                                                         TensorInfo(TensorShape(3U, 3U, 16U), 1, DataType::F32, 0),
+                                                         TensorInfo(TensorShape(3U, 3U, 24U), 1, DataType::QASYMM8, 0),
+                                                       })),
+               framework::dataset::make("BiasesInfo", { TensorInfo(TensorShape(2U), 1, DataType::F32, 0),
+                                                        TensorInfo(TensorShape(2U), 1, DataType::F32, 0),
+                                                        TensorInfo(TensorShape(2U), 1, DataType::F32, 0),
+                                                        TensorInfo(TensorShape(2U), 1, DataType::S32, 0),
+                                                        TensorInfo(TensorShape(2U), 1, DataType::F32, 0),
+                                                        TensorInfo(TensorShape(2U), 1, DataType::F32, 0),
+                                                        TensorInfo(TensorShape(4U), 1, DataType::F32, 0),
+                                                        TensorInfo(TensorShape(2U, 2U), 1, DataType::F32, 0),
+                                                        TensorInfo(TensorShape(2U), 1, DataType::F32, 0),
+                                                        TensorInfo(TensorShape(2U), 1, DataType::F32, 0),
+                                                        TensorInfo(TensorShape(16U), 1, DataType::F32, 0),
+                                                        TensorInfo(TensorShape(24U), 1, DataType::S32, 0),
+                                                      })),
+               framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(30U, 16U, 2U), 1, DataType::F32, 0),
+                                                        TensorInfo(TensorShape(30U, 16U, 2U), 1, DataType::F32, 0),
+                                                        TensorInfo(TensorShape(30U, 16U, 2U), 1, DataType::F32, 0),
+                                                        TensorInfo(TensorShape(30U, 16U, 2U), 1, DataType::QASYMM8, 0),
+                                                        TensorInfo(TensorShape(30U, 16U, 2U), 1, DataType::F32, 0),
+                                                        TensorInfo(TensorShape(30U, 16U, 2U), 1, DataType::F32, 0),
+                                                        TensorInfo(TensorShape(30U, 16U, 2U), 1, DataType::F32, 0),
+                                                        TensorInfo(TensorShape(30U, 16U, 2U), 1, DataType::F32, 0),
+                                                        TensorInfo(TensorShape(32U, 18U, 2U), 1, DataType::F32, 0),
+                                                        TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F32, 0),
+                                                        TensorInfo(TensorShape(30U, 16U, 16U), 1, DataType::F32, 0),
+                                                        TensorInfo(TensorShape(48U, 30U, 24U), 1, DataType::QASYMM8, 0),
+                                                      })),
+               framework::dataset::make("ConvInfo", { PadStrideInfo(1, 1, 0, 0),
+                                                      PadStrideInfo(1, 1, 0, 0),
+                                                      PadStrideInfo(1, 1, 0, 0),
+                                                      PadStrideInfo(1, 1, 0, 0),
+                                                      PadStrideInfo(1, 1, 0, 0),
+                                                      PadStrideInfo(4, 1, 0, 0),
+                                                      PadStrideInfo(1, 1, 0, 0),
+                                                      PadStrideInfo(1, 1, 0, 0),
+                                                      PadStrideInfo(1, 1, 0, 0),
+                                                      PadStrideInfo(1, 1, 0, 0),
+                                                      PadStrideInfo(1, 1, 0, 0),
+                                                      PadStrideInfo(1, 1, 0, 0),
+                                                     })),
+               framework::dataset::make("DepthMultiplier", { 1,
+                                                             1,
+                                                             1,
+                                                             1,
+                                                             3,
+                                                             1,
+                                                             1,
+                                                             1,
+                                                             1,
+                                                             1,
+                                                             2,
+                                                             3,
+                                                            })),
+                framework::dataset::make("ActivationInfo", { ActivationLayerInfo(),
+                                                             ActivationLayerInfo(),
+                                                             ActivationLayerInfo(),
+                                                             ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR),
+                                                             ActivationLayerInfo(),
+                                                             ActivationLayerInfo(),
+                                                             ActivationLayerInfo(),
+                                                             ActivationLayerInfo(),
+                                                             ActivationLayerInfo(),
+                                                             ActivationLayerInfo(),
+                                                             ActivationLayerInfo(),
+                                                             ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
+                                                           })),
+               framework::dataset::make("Expected", { false, false, false, false, false, false, false, false, false, false, true, true })),
+               input_info, weights_info, biases_info, output_info, conv_info, depth_multiplier, act_info, expected)
+{
+    bool is_valid = bool(CLDepthwiseConvolutionLayer3x3::validate(&input_info.clone()->set_is_resizable(false), &weights_info.clone()->set_is_resizable(false), &biases_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), conv_info, depth_multiplier, act_info)); // clone() so set_is_resizable() does not mutate the shared dataset entry
+    ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS);
+}
+
+DATA_TEST_CASE(ValidateGeneric, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip( // Pass/fail cases for the generic (non-3x3-specialised) CLDepthwiseConvolutionLayer::validate()
+                framework::dataset::make("InputInfo", { TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32, 0),    // Mismatching data type input/weights
+                                                        TensorInfo(TensorShape(27U, 13U, 3U), 1, DataType::F32, 0),    // Mismatching input feature maps
+                                                        TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32, 0),    // Mismatching depth multiplier
+                                                        TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32, 0),    // Invalid biases size
+                                                        TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32, 0),    // Invalid biases dimensions
+                                                        TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32, 0),    // Invalid output size
+                                                        TensorInfo(TensorShape(27U, 13U, 8U), 1, DataType::F32, 0),
+                                                        TensorInfo(TensorShape(32U, 13U, 8U), 1, DataType::QASYMM8, 0),
+                                                      }),
+                framework::dataset::make("WeightsInfo", { TensorInfo(TensorShape(3U, 3U, 2U), 1, DataType::F16, 0),
+                                                          TensorInfo(TensorShape(3U, 3U, 2U), 1, DataType::F32, 0),
+                                                          TensorInfo(TensorShape(3U, 3U, 2U), 1, DataType::F32, 0),
+                                                          TensorInfo(TensorShape(3U, 3U, 2U), 1, DataType::F32, 0),
+                                                          TensorInfo(TensorShape(3U, 3U, 2U), 1, DataType::F32, 0),
+                                                          TensorInfo(TensorShape(3U, 3U, 2U), 1, DataType::F32, 0),
+                                                          TensorInfo(TensorShape(3U, 3U, 16U), 1, DataType::F32, 0),
+                                                          TensorInfo(TensorShape(3U, 3U, 24U), 1, DataType::QASYMM8, 0),
+                                                        })),
+                framework::dataset::make("BiasesInfo", { TensorInfo(TensorShape(2U), 1, DataType::F32, 0),
+                                                         TensorInfo(TensorShape(2U), 1, DataType::F32, 0),
+                                                         TensorInfo(TensorShape(2U), 1, DataType::F32, 0),
+                                                         TensorInfo(TensorShape(4U), 1, DataType::F32, 0),
+                                                         TensorInfo(TensorShape(2U, 2U), 1, DataType::F32, 0),
+                                                         TensorInfo(TensorShape(2U), 1, DataType::F32, 0),
+                                                         TensorInfo(TensorShape(16U), 1, DataType::F32, 0),
+                                                         TensorInfo(TensorShape(24U), 1, DataType::S32, 0),
+                                                       })),
+                framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F32, 0),
+                                                         TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F32, 0),
+                                                         TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F32, 0),
+                                                         TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F32, 0),
+                                                         TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F32, 0),
+                                                         TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32, 0),
+                                                         TensorInfo(TensorShape(25U, 11U, 16U), 1, DataType::F32, 0),
+                                                         TensorInfo(TensorShape(32U, 11U, 24U), 1, DataType::QASYMM8, 0),
+                                                       })),
+                framework::dataset::make("ConvInfo", { PadStrideInfo(1, 1, 0, 0),
+                                                       PadStrideInfo(1, 1, 0, 0),
+                                                       PadStrideInfo(1, 1, 0, 0),
+                                                       PadStrideInfo(1, 1, 0, 0),
+                                                       PadStrideInfo(1, 1, 0, 0),
+                                                       PadStrideInfo(1, 1, 0, 0),
+                                                       PadStrideInfo(1, 1, 0, 0),
+                                                       PadStrideInfo(1, 1, 1, 0),
+                                                      })),
+                framework::dataset::make("DepthMultiplier", { 1,
+                                                              1,
+                                                              3,
+                                                              1,
+                                                              1,
+                                                              1,
+                                                              2,
+                                                              3,
+                                                             })),
+                framework::dataset::make("Expected", { false, false, false, false, false, false, true, true })),
+                input_info, weights_info, biases_info, output_info, conv_info, depth_multiplier, expected)
+{
+    bool is_valid = bool(CLDepthwiseConvolutionLayer::validate(&input_info.clone()->set_is_resizable(false), &weights_info.clone()->set_is_resizable(false), &biases_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), conv_info, depth_multiplier)); // clone() so set_is_resizable() does not mutate the shared dataset entry
+    ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS);
+}
+// clang-format on
+// *INDENT-ON*
+
 template <typename T>
 using CLDepthwiseConvolutionLayerFixture = DepthwiseConvolutionLayerValidationFixture<CLTensor, CLAccessor, CLDepthwiseConvolutionLayer, T>;
 
-TEST_SUITE(Generic)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::ALL, combine(datasets::SmallDepthwiseConvolutionLayerDataset(), framework::dataset::make("DataType",
-                                                                                                                 DataType::F32)))
-{
-    validate(CLAccessor(_target), _reference, tolerance_f32);
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, CLDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeDepthwiseConvolutionLayerDataset(),
-                                                                                                                     framework::dataset::make("DataType",
-                                                                                                                             DataType::F32)))
-{
-    validate(CLAccessor(_target), _reference, tolerance_f32);
-}
-TEST_SUITE_END()
-
 template <typename T>
 using CLDepthwiseConvolutionLayerFixture3x3 = DepthwiseConvolutionLayerValidationFixture<CLTensor, CLAccessor, CLDepthwiseConvolutionLayer3x3, T>;
 
 TEST_SUITE(Float)
-TEST_SUITE(F16)
+TEST_SUITE(FP16)
 TEST_SUITE(W3x3)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLDepthwiseConvolutionLayerFixture3x3<half>, framework::DatasetMode::ALL, combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(),
-                                                                                                                   framework::dataset::make("DataType",
-                                                                                                                           DataType::F16)))
+FIXTURE_DATA_TEST_CASE(RunSmall, CLDepthwiseConvolutionLayerFixture3x3<half>, framework::DatasetMode::ALL,
+                       combine(combine(combine(framework::dataset::concat(datasets::SmallDepthwiseConvolutionLayerDataset3x3(),
+                                                                          datasets::SmallDepthwiseConvolutionLayerDataset3x3NCHW()),
+                                               depth_multipliers),
+                                       framework::dataset::make("DataType",
+                                                                DataType::F16)),
+                               framework::dataset::make("DataLayout", DataLayout::NCHW)))
 {
     validate(CLAccessor(_target), _reference, tolerance_f16);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, CLDepthwiseConvolutionLayerFixture3x3<half>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(),
+FIXTURE_DATA_TEST_CASE(RunLarge, CLDepthwiseConvolutionLayerFixture3x3<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(),
+                                                                                                                       depth_multipliers),
                                                                                                                        framework::dataset::make("DataType",
-                                                                                                                               DataType::F16)))
+                                                                                                                               DataType::F16)),
+                                                                                                                       framework::dataset::make("DataLayout", DataLayout::NCHW)))
 {
     validate(CLAccessor(_target), _reference, tolerance_f16);
 }
 TEST_SUITE_END()
+
+TEST_SUITE(Generic)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLDepthwiseConvolutionLayerFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset(), depth_multipliers),
+                                                                                                                        framework::dataset::make("DataType",
+                                                                                                                                DataType::F16)),
+                                                                                                                framework::dataset::make("DataLayout", DataLayout::NCHW)))
+{
+    validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge, CLDepthwiseConvolutionLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset(),
+                                                                                                                    depth_multipliers),
+                                                                                                                    framework::dataset::make("DataType",
+                                                                                                                            DataType::F16)),
+                                                                                                                    framework::dataset::make("DataLayout", DataLayout::NCHW)))
+{
+    validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num);
+}
+TEST_SUITE_END()
 TEST_SUITE_END()
 
 TEST_SUITE(FP32)
 TEST_SUITE(W3x3)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLDepthwiseConvolutionLayerFixture3x3<float>, framework::DatasetMode::ALL, combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(),
-                                                                                                                    framework::dataset::make("DataType",
-                                                                                                                            DataType::F32)))
+FIXTURE_DATA_TEST_CASE(RunSmall, CLDepthwiseConvolutionLayerFixture3x3<float>, framework::DatasetMode::ALL,
+                       combine(combine(combine(framework::dataset::concat(datasets::SmallDepthwiseConvolutionLayerDataset3x3(),
+                                                                          datasets::SmallDepthwiseConvolutionLayerDataset3x3NCHW()),
+                                               depth_multipliers),
+                                       framework::dataset::make("DataType",
+                                                                DataType::F32)),
+                               framework::dataset::make("DataLayout", DataLayout::NCHW)))
 {
     validate(CLAccessor(_target), _reference, tolerance_f32);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, CLDepthwiseConvolutionLayerFixture3x3<float>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(),
+FIXTURE_DATA_TEST_CASE(RunLarge, CLDepthwiseConvolutionLayerFixture3x3<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(),
+                                                                                                                        depth_multipliers),
                                                                                                                         framework::dataset::make("DataType",
-                                                                                                                                DataType::F32)))
+                                                                                                                                DataType::F32)),
+                                                                                                                        framework::dataset::make("DataLayout", DataLayout::NCHW)))
+{
+    validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+TEST_SUITE_END()
+
+TEST_SUITE(Generic)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset(), depth_multipliers),
+                                                                                                                 framework::dataset::make("DataType",
+                                                                                                                         DataType::F32)),
+                                                                                                                 framework::dataset::make("DataLayout", DataLayout::NCHW)))
+{
+    validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge, CLDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset(),
+                                                                                                                     depth_multipliers),
+                                                                                                                     framework::dataset::make("DataType",
+                                                                                                                             DataType::F32)),
+                                                                                                                     framework::dataset::make("DataLayout", DataLayout::NCHW)))
 {
     validate(CLAccessor(_target), _reference, tolerance_f32);
 }
@@ -114,29 +316,41 @@
 TEST_SUITE(Quantized)
 TEST_SUITE(QASYMM8)
 TEST_SUITE(Generic)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLDepthwiseConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset(),
-                       framework::dataset::make("DataType", DataType::QASYMM8)),
-                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, 10) })))
+FIXTURE_DATA_TEST_CASE(RunSmall, CLDepthwiseConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset(),
+                                                       depth_multipliers),
+                                               framework::dataset::make("DataType", DataType::QASYMM8)),
+                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, 10) })),
+                               framework::dataset::make("DataLayout", DataLayout::NCHW)))
 {
     validate(CLAccessor(_target), _reference, tolerance_qasymm8);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, CLDepthwiseConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset(),
-                       framework::dataset::make("DataType", DataType::QASYMM8)),
-                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, 10) })))
+FIXTURE_DATA_TEST_CASE(RunLarge, CLDepthwiseConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset(),
+                                                       depth_multipliers),
+                                               framework::dataset::make("DataType", DataType::QASYMM8)),
+                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, 10) })),
+                               framework::dataset::make("DataLayout", DataLayout::NCHW)))
 {
     validate(CLAccessor(_target), _reference, tolerance_qasymm8);
 }
 TEST_SUITE_END()
 TEST_SUITE(W3x3)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLDepthwiseConvolutionLayerQuantizedFixture3x3<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(),
-                       framework::dataset::make("DataType", DataType::QASYMM8)),
-                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, 10) })))
+FIXTURE_DATA_TEST_CASE(RunSmall, CLDepthwiseConvolutionLayerQuantizedFixture3x3<uint8_t>, framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(),
+                                                       framework::dataset::make("DepthMultiplier", 1)), // COMPMID-1071 Add depth multiplier support for NHWC
+                                               framework::dataset::make("DataType", DataType::QASYMM8)),
+                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, 10) })),
+                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 {
     validate(CLAccessor(_target), _reference, tolerance_qasymm8);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, CLDepthwiseConvolutionLayerQuantizedFixture3x3<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(),
-                       framework::dataset::make("DataType", DataType::QASYMM8)),
-                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, 10) })))
+FIXTURE_DATA_TEST_CASE(RunLarge, CLDepthwiseConvolutionLayerQuantizedFixture3x3<uint8_t>, framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(),
+                                                       framework::dataset::make("DepthMultiplier", 1)), // COMPMID-1071 Add depth multiplier support for NHWC
+                                               framework::dataset::make("DataType", DataType::QASYMM8)),
+                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, 10) })),
+                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 {
     validate(CLAccessor(_target), _reference, tolerance_qasymm8);
 }
diff --git a/tests/validation/CL/DilatedConvolutionLayer.cpp b/tests/validation/CL/DilatedConvolutionLayer.cpp
new file mode 100644
index 0000000..18d0fa8
--- /dev/null
+++ b/tests/validation/CL/DilatedConvolutionLayer.cpp
@@ -0,0 +1,297 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/CLTensorAllocator.h"
+#include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"
+#include "arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h"
+#include "tests/CL/CLAccessor.h"
+#include "tests/PaddingCalculator.h"
+#include "tests/datasets/DilatedConvolutionLayerDataset.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/fixtures/ConvolutionLayerFixture.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace
+{
+RelativeTolerance<float>            tolerance_f32(0.05f);                 /**< Tolerance value for comparing reference's output against implementation's output for DataType::F32 */
+RelativeTolerance<half_float::half> tolerance_f16(half_float::half(0.2)); /**< Tolerance value for comparing reference's output against implementation's output for DataType::F16 */
+constexpr AbsoluteTolerance<float>  tolerance_fixed(1.0f);                /**< Tolerance value for comparing reference's output against implementation's output for fixed point data types */
+constexpr AbsoluteTolerance<float>  tolerance_qasymm8(0.0);               /**< Tolerance value for comparing reference's output against implementation's output for quantized data types */
+constexpr float                     tolerance_num = 0.07f;                /**< Tolerance number */
+
+/** CNN data types */
+const auto CNNDataTypes = framework::dataset::make("DataType",
+{
+    DataType::F16,
+    DataType::F32,
+    DataType::QS8,
+    DataType::QS16,
+    DataType::QASYMM8,
+});
+} // namespace
+
+TEST_SUITE(CL)
+TEST_SUITE(DilatedConvolutionLayer)
+
+DATA_TEST_CASE(ValidateConvolutionMethod, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(
+                                                                                               framework::dataset::make("InputInfo", { TensorInfo(TensorShape(17U, 31U, 2U), 1, DataType::F32, 0),
+                                                                                                                        TensorInfo(TensorShape(17U, 31U, 2U), 1, DataType::F32, 0),
+                                                                                                                        TensorInfo(TensorShape(23U, 27U, 23U, 4U), 1, DataType::F32, 0),
+                                                                                                                        TensorInfo(TensorShape(3U, 3U, 2U, 1U), 1, DataType::F32, 0),
+                                                                                                                        TensorInfo(TensorShape(33U, 27U, 7U, 4U), 1, DataType::F32, 0)
+                                                                                                                                     }),
+                                                                                               framework::dataset::make("WeightsInfo", { TensorInfo(TensorShape(5U, 5U, 2U, 19U), 1, DataType::F32, 0),
+                                                                                                                        TensorInfo(TensorShape(5U, 5U, 2U, 19U), 1, DataType::F32, 0),
+                                                                                                                        TensorInfo(TensorShape(3U, 3U, 23U, 21U), 1, DataType::F32, 0),
+                                                                                                                        TensorInfo(TensorShape(3U, 3U, 5U, 21U), 1, DataType::F32, 0),
+                                                                                                                        TensorInfo(TensorShape(5U, 5U, 7U, 16U), 1, DataType::F16, 0)
+                                                                                                                                       })),
+                                                                                           framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(15U, 15U, 19U), 1, DataType::F32, 0),
+                                                                                                                    TensorInfo(TensorShape(15U, 15U, 19U), 1, DataType::F32, 0),
+                                                                                                                    TensorInfo(TensorShape(21U, 25U, 21U, 4U), 1, DataType::F32, 0),
+                                                                                                                    TensorInfo(TensorShape(11U, 25U, 21U), 1, DataType::F32, 0),
+                                                                                                                    TensorInfo(TensorShape(11U, 12U, 16U, 4U), 1, DataType::F32, 0)
+                                                                                                                                  })),
+                                                                                       framework::dataset::make("ConvInfo", { PadStrideInfo(1, 2, 1, 1),
+                                                                                                                PadStrideInfo(1, 2, 1, 1),
+                                                                                                                PadStrideInfo(1, 1, 0, 0),
+                                                                                                                PadStrideInfo(2, 1, 0, 0),
+                                                                                                                PadStrideInfo(3, 2, 1, 0)
+                                                                                                                            })),
+                                                                                   framework::dataset::make("GpuTarget", { GPUTarget::BIFROST,
+                                                                                                            GPUTarget::MIDGARD,
+                                                                                                            GPUTarget::G71,
+                                                                                                            GPUTarget::MIDGARD,
+                                                                                                            GPUTarget::BIFROST
+                                                                                                                         })),
+                                                                               framework::dataset::make("Dilation", { Size2D(1U, 1U),
+                                                                                                                      Size2D(1U, 1U),
+                                                                                                                      Size2D(1U, 1U),
+                                                                                                                      Size2D(2U, 2U),
+                                                                                                                      Size2D(3U, 3U)
+                                                                                                                    })),
+
+                                                                           framework::dataset::make("Expected", { ConvolutionMethod::GEMM, ConvolutionMethod::GEMM, ConvolutionMethod::WINOGRAD, ConvolutionMethod::GEMM, ConvolutionMethod::GEMM })),
+               input_info, weights_info, output_info, conv_info, gpu_target, dilation, expected)
+{
+    ConvolutionMethod is_valid = CLConvolutionLayer::get_convolution_method(&input_info.clone()->set_is_resizable(true),
+                                                                            &weights_info.clone()->set_is_resizable(true),
+                                                                            &output_info.clone()->set_is_resizable(true), conv_info, WeightsInfo(), ActivationLayerInfo(), gpu_target, dilation);
+    ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS);
+}
+TEST_SUITE_END()
+
+TEST_SUITE(GEMMDilatedConvolutionLayer)
+
+DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(framework::dataset::concat(datasets::SmallDilatedConvolutionLayerDataset(), datasets::LargeDilatedConvolutionLayerDataset()),
+                                                                   CNNDataTypes),
+               input_shape, weights_shape, bias_shape, output_shape, info, dilation, data_type)
+{
+    // Set fixed point position data type allowed
+    int fixed_point_position = is_data_type_fixed_point(data_type) ? 3 : 0;
+
+    auto bias_data_type = is_data_type_quantized_asymmetric(data_type) ? DataType::S32 : data_type;
+
+    // Create tensors
+    CLTensor src     = create_tensor<CLTensor>(input_shape, data_type, 1, fixed_point_position, QuantizationInfo(2.f / 255.f, 127));
+    CLTensor weights = create_tensor<CLTensor>(weights_shape, data_type, 1, fixed_point_position, QuantizationInfo(2.f / 255.f, 127));
+    CLTensor bias    = create_tensor<CLTensor>(bias_shape, bias_data_type, 1, fixed_point_position, QuantizationInfo(2.f / 255.f, 127));
+    CLTensor dst     = create_tensor<CLTensor>(output_shape, data_type, 1, fixed_point_position, QuantizationInfo(2.f / 255.f, 127));
+
+    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(weights.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(bias.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+    const QuantizationInfo src_quantization_info     = src.info()->quantization_info();
+    const QuantizationInfo weights_quantization_info = weights.info()->quantization_info();
+
+    // Create and configure function
+    CLGEMMConvolutionLayer conv;
+    conv.configure(&src, &weights, &bias, &dst, info, WeightsInfo(), dilation);
+
+    // Validate valid region
+    const ValidRegion src_valid_region     = shape_to_valid_region(input_shape);
+    const ValidRegion weights_valid_region = shape_to_valid_region(weights_shape);
+    const ValidRegion bias_valid_region    = shape_to_valid_region(bias_shape);
+    const ValidRegion dst_valid_region     = shape_to_valid_region(output_shape);
+
+    validate(src.info()->valid_region(), src_valid_region);
+    validate(weights.info()->valid_region(), weights_valid_region);
+    validate(bias.info()->valid_region(), bias_valid_region);
+    validate(dst.info()->valid_region(), dst_valid_region);
+
+    // Validate QuantizationInfo
+    ARM_COMPUTE_EXPECT(src.info()->quantization_info() == src_quantization_info, framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(weights.info()->quantization_info() == weights_quantization_info, framework::LogLevel::ERRORS);
+
+    // Validate padding
+}
+
+template <typename T>
+using CLGEMMDilatedConvolutionLayerFixture = ConvolutionValidationFixture<CLTensor, CLAccessor, CLGEMMConvolutionLayer, T>;
+
+TEST_SUITE(Float)
+TEST_SUITE(FP16)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMDilatedConvolutionLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallDilatedConvolutionLayerDataset(),
+                                                                                                                        framework::dataset::make("ReshapeWeights", { true })),
+                                                                                                                        framework::dataset::make("DataType", DataType::F16)),
+                                                                                                                        framework::dataset::make("DataLayout", { DataLayout::NCHW })),
+                                                                                                                        framework::dataset::make("ActivationLayerInfo", ActivationLayerInfo())))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge, CLGEMMDilatedConvolutionLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeDilatedConvolutionLayerDataset(),
+                                                                                                                      framework::dataset::make("ReshapeWeights", { true })),
+                                                                                                                      framework::dataset::make("DataType", DataType::F16)),
+                                                                                                                      framework::dataset::make("DataLayout", { DataLayout::NCHW })),
+                                                                                                                      framework::dataset::make("ActivationLayerInfo", ActivationLayerInfo())))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num);
+}
+TEST_SUITE_END()
+
+TEST_SUITE(FP32)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMDilatedConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallDilatedConvolutionLayerDataset(),
+                       framework::dataset::make("ReshapeWeights", { true })),
+                       framework::dataset::make("DataType", DataType::F32)),
+                       framework::dataset::make("DataLayout", { DataLayout::NCHW })),
+                       framework::dataset::make("ActivationLayerInfo", ActivationLayerInfo())))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge, CLGEMMDilatedConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeDilatedConvolutionLayerDataset(),
+                                                                                                                       framework::dataset::make("ReshapeWeights", { true })),
+                                                                                                                       framework::dataset::make("DataType", DataType::F32)),
+                                                                                                                       framework::dataset::make("DataLayout", { DataLayout::NCHW })),
+                                                                                                                       framework::dataset::make("ActivationLayerInfo", ActivationLayerInfo())))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_f32, 0.00002);
+}
+TEST_SUITE_END()
+TEST_SUITE_END()
+
+template <typename T>
+using CLGEMMDilatedConvolutionLayerFixedPointFixture = ConvolutionValidationFixedPointFixture<CLTensor, CLAccessor, CLGEMMConvolutionLayer, T>;
+
+TEST_SUITE(FixedPoint)
+TEST_SUITE(QS8)
+// We test for fixed point precision [4,6]
+FIXTURE_DATA_TEST_CASE(RunTiny, CLGEMMDilatedConvolutionLayerFixedPointFixture<int8_t>, framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(combine(datasets::TinyDilatedConvolutionLayerDataset(),
+                                                       framework::dataset::make("ReshapeWeights", { true })),
+                                               framework::dataset::make("DataType",
+                                                                        DataType::QS8)),
+                                       framework::dataset::make("FractionalBits", 4, 7)),
+                               framework::dataset::make("ActivationLayerInfo", { ActivationLayerInfo() })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_fixed);
+}
+FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMDilatedConvolutionLayerFixedPointFixture<int8_t>, framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(combine(datasets::SmallDilatedConvolutionLayerDataset(),
+                                                       framework::dataset::make("ReshapeWeights", { true })),
+                                               framework::dataset::make("DataType",
+                                                                        DataType::QS8)),
+                                       framework::dataset::make("FractionalBits", 4, 7)),
+                               framework::dataset::make("ActivationLayerInfo", { ActivationLayerInfo() })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_fixed);
+}
+TEST_SUITE_END()
+
+TEST_SUITE(QS16)
+// Testing for fixed point position [1,14)
+FIXTURE_DATA_TEST_CASE(RunTiny, CLGEMMDilatedConvolutionLayerFixedPointFixture<int16_t>, framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(combine(datasets::TinyDilatedConvolutionLayerDataset(),
+                                                       framework::dataset::make("ReshapeWeights", { true })),
+                                               framework::dataset::make("DataType",
+                                                                        DataType::QS16)),
+                                       framework::dataset::make("FractionalBits", 1, 14)),
+                               framework::dataset::make("ActivationLayerInfo", { ActivationLayerInfo() })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_fixed);
+}
+FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMDilatedConvolutionLayerFixedPointFixture<int16_t>, framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(combine(datasets::SmallDilatedConvolutionLayerDataset(),
+                                                       framework::dataset::make("ReshapeWeights", { true })),
+                                               framework::dataset::make("DataType",
+                                                                        DataType::QS16)),
+                                       framework::dataset::make("FractionalBits", 1, 14)),
+                               framework::dataset::make("ActivationLayerInfo", { ActivationLayerInfo() })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_fixed);
+}
+TEST_SUITE_END()
+TEST_SUITE_END()
+
+template <typename T>
+using CLGEMMDilatedConvolutionLayerQuantizedFixture = ConvolutionValidationQuantizedFixture<CLTensor, CLAccessor, CLGEMMConvolutionLayer, T>;
+
+TEST_SUITE(Quantized)
+TEST_SUITE(QASYMM8)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMDilatedConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(combine(datasets::SmallDilatedConvolutionLayerDataset(),
+                                                       framework::dataset::make("ReshapeWeights", { true })),
+                                               framework::dataset::make("DataType", DataType::QASYMM8)),
+                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255.f, 10) })),
+                               framework::dataset::make("ActivationLayerInfo", { ActivationLayerInfo() })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge, CLGEMMDilatedConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(combine(datasets::LargeDilatedConvolutionLayerDataset(),
+                                                       framework::dataset::make("ReshapeWeights", { true })),
+                                               framework::dataset::make("DataType", DataType::QASYMM8)),
+                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255.f, 0) })),
+                               framework::dataset::make("ActivationLayerInfo", { ActivationLayerInfo() })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8);
+}
+TEST_SUITE_END()
+TEST_SUITE_END()
+
+TEST_SUITE_END()
+TEST_SUITE_END()
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/CL/DirectConvolutionLayer.cpp b/tests/validation/CL/DirectConvolutionLayer.cpp
index 12f3d3d..00a01b0 100644
--- a/tests/validation/CL/DirectConvolutionLayer.cpp
+++ b/tests/validation/CL/DirectConvolutionLayer.cpp
@@ -43,6 +43,7 @@
 {
 namespace
 {
+// COMPMID-517 Investigate the mismatch to see whether it is a real bug
 RelativeTolerance<half>  tolerance_fp16(half(0.2)); /**< Tolerance for floating point tests */
 RelativeTolerance<float> tolerance_fp32(0.02f);     /**< Tolerance for floating point tests */
 constexpr float          tolerance_num = 0.07f;     /**< Tolerance number */
@@ -72,6 +73,14 @@
                                                                              combine(framework::dataset::make("PadY", 0, 2),
                                                                                      framework::dataset::make("KernelSize", { 3 })))),
                                                               framework::dataset::make("NumKernels", { 1, 4, 8, 16 })))));
+/** Activation function Dataset*/
+const auto ActivationFunctionsDataset = framework::dataset::make("ActivationInfo",
+{
+    ActivationLayerInfo(),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 0.5f),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 0.5f)
+});
 } // namespace
 
 TEST_SUITE(CL)
@@ -79,7 +88,7 @@
 
 // *INDENT-OFF*
 // clang-format off
-DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(
                framework::dataset::make("InputInfo", { TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32, 0), // Mismatching data type input/weights
                                                        TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32, 0), // Mismatching input feature maps
                                                        TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32, 0), // Unsupported kernel width
@@ -140,10 +149,14 @@
                                                        PadStrideInfo(1, 1, 0, 0),
                                                        PadStrideInfo(1, 1, 0, 0),
                                                       })),
-               framework::dataset::make("Expected", { false, false, false, false, false, false, false, false, false, false, true })),
-               input_info, weights_info, biases_info, output_info, conv_info, expected)
+                       framework::dataset::make("ActivationInfo",
 {
-    bool is_valid = bool(CLDirectConvolutionLayer::validate(&input_info.clone()->set_is_resizable(false), &weights_info.clone()->set_is_resizable(false), &biases_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), conv_info));
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)
+})),
+               framework::dataset::make("Expected", { false, false, false, false, false, false, false, false, false, false, true })),
+               input_info, weights_info, biases_info, output_info, conv_info, act_info, expected)
+{
+    bool is_valid = bool(CLDirectConvolutionLayer::validate(&input_info.clone()->set_is_resizable(false), &weights_info.clone()->set_is_resizable(false), &biases_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), conv_info, act_info));
     ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS);
 }
 // clang-format on
@@ -156,7 +169,9 @@
 
 TEST_SUITE(Float)
 TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(Run, CLDirectConvolutionLayerFixture<half>, framework::DatasetMode::ALL, combine(data, framework::dataset::make("DataType", DataType::F16)))
+FIXTURE_DATA_TEST_CASE(Run, CLDirectConvolutionLayerFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(data, framework::dataset::make("DataType", DataType::F16)),
+                                                                                                                ActivationFunctionsDataset),
+                                                                                                        framework::dataset::make("DataLayout", DataLayout::NCHW)))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_fp16, tolerance_num);
@@ -164,7 +179,9 @@
 TEST_SUITE_END()
 
 TEST_SUITE(FP32)
-FIXTURE_DATA_TEST_CASE(Run, CLDirectConvolutionLayerFixture<float>, framework::DatasetMode::ALL, combine(data, framework::dataset::make("DataType", DataType::F32)))
+FIXTURE_DATA_TEST_CASE(Run, CLDirectConvolutionLayerFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(data, framework::dataset::make("DataType", DataType::F32)),
+                                                                                                                 ActivationFunctionsDataset),
+                                                                                                         framework::dataset::make("DataLayout", DataLayout::NCHW)))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_fp32);
@@ -172,8 +189,9 @@
 TEST_SUITE_END()
 
 TEST_SUITE(FP32_CustomDataset)
-FIXTURE_DATA_TEST_CASE(Run, CLDirectConvolutionValidationWithTensorShapesFixture<float>, framework::DatasetMode::ALL, combine(datasets::DirectConvolutionLayerDataset(),
-                       framework::dataset::make("DataType", DataType::F32)))
+FIXTURE_DATA_TEST_CASE(Run, CLDirectConvolutionValidationWithTensorShapesFixture<float>, framework::DatasetMode::ALL, combine(combine(datasets::DirectConvolutionLayerDataset(),
+                       framework::dataset::make("DataType", DataType::F32)),
+                       ActivationFunctionsDataset))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_fp32);
@@ -186,8 +204,10 @@
 
 TEST_SUITE(FixedPoint)
 TEST_SUITE(QS8)
-FIXTURE_DATA_TEST_CASE(Run, CLDirectConvolutionLayerFixedPointFixture<int8_t>, framework::DatasetMode::ALL, combine(combine(data_fixed_point, framework::dataset::make("DataType", DataType::QS8)),
-                                                                                                                    framework::dataset::make("FractionalBits", 2, 7)))
+FIXTURE_DATA_TEST_CASE(Run, CLDirectConvolutionLayerFixedPointFixture<int8_t>, framework::DatasetMode::ALL, combine(combine(combine(data_fixed_point, framework::dataset::make("DataType",
+                                                                                                                    DataType::QS8)),
+                                                                                                                    framework::dataset::make("FractionalBits", 2, 7)),
+                                                                                                                    ActivationFunctionsDataset))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_qs8);
@@ -195,8 +215,10 @@
 TEST_SUITE_END()
 
 TEST_SUITE(QS16)
-FIXTURE_DATA_TEST_CASE(Run, CLDirectConvolutionLayerFixedPointFixture<int16_t>, framework::DatasetMode::ALL, combine(combine(data_fixed_point, framework::dataset::make("DataType", DataType::QS16)),
-                                                                                                                     framework::dataset::make("FractionalBits", 2, 15)))
+FIXTURE_DATA_TEST_CASE(Run, CLDirectConvolutionLayerFixedPointFixture<int16_t>, framework::DatasetMode::ALL, combine(combine(combine(data_fixed_point, framework::dataset::make("DataType",
+                                                                                                                     DataType::QS16)),
+                                                                                                                     framework::dataset::make("FractionalBits", 2, 15)),
+                                                                                                                     ActivationFunctionsDataset))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_qs16);
@@ -209,10 +231,17 @@
 template <typename T>
 using CLDirectConvolutionValidationWithTensorShapesQuantizedFixture = DirectConvolutionValidationWithTensorShapesQuantizedFixture<CLTensor, CLAccessor, CLDirectConvolutionLayer, T>;
 
+const auto QuantizedActivationFunctionsDataset = framework::dataset::make("ActivationInfo",
+{
+    ActivationLayerInfo(),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.f)
+});
 TEST_SUITE(Quantized)
 TEST_SUITE(QASYMM8)
-FIXTURE_DATA_TEST_CASE(Run, CLDirectConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(data, framework::dataset::make("DataType", DataType::QASYMM8)),
-                                                                                                                    framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255, 10) })))
+FIXTURE_DATA_TEST_CASE(Run, CLDirectConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(data, framework::dataset::make("DataType", DataType::QASYMM8)),
+                                                                                                                    framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255, 10) })),
+                                                                                                                    QuantizedActivationFunctionsDataset))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_qasymm8);
@@ -220,9 +249,10 @@
 TEST_SUITE_END()
 
 TEST_SUITE(QASYMM8_CustomDataset)
-FIXTURE_DATA_TEST_CASE(Run, CLDirectConvolutionValidationWithTensorShapesQuantizedFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(datasets::DirectConvolutionLayerDataset(),
+FIXTURE_DATA_TEST_CASE(Run, CLDirectConvolutionValidationWithTensorShapesQuantizedFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(datasets::DirectConvolutionLayerDataset(),
                        framework::dataset::make("DataType", DataType::QASYMM8)),
-                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255, 127) })))
+                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255, 127) })),
+                       QuantizedActivationFunctionsDataset))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_qasymm8);
diff --git a/tests/validation/CL/GaussianPyramid.cpp b/tests/validation/CL/GaussianPyramid.cpp
new file mode 100644
index 0000000..2a4596d
--- /dev/null
+++ b/tests/validation/CL/GaussianPyramid.cpp
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/CLTensorAllocator.h"
+#include "arm_compute/runtime/CL/functions/CLGaussianPyramid.h"
+#include "tests/CL/CLAccessor.h"
+#include "tests/PaddingCalculator.h"
+#include "tests/datasets/BorderModeDataset.h"
+#include "tests/datasets/ShapeDatasets.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/fixtures/GaussianPyramidHalfFixture.h"
+#include "tests/validation/reference/Utils.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace
+{
+const auto small_gaussian_pyramid_levels = combine(datasets::Medium2DShapes(), datasets::BorderModes()) * framework::dataset::make("numlevels", 2, 4);
+const auto large_gaussian_pyramid_levels = combine(datasets::Large2DShapes(), datasets::BorderModes()) * framework::dataset::make("numlevels", 2, 5);
+
+template <typename T>
+inline void validate_gaussian_pyramid(const CLPyramid &target, const std::vector<SimpleTensor<T>> &reference, BorderMode border_mode)
+{
+    ValidRegion prev_valid_region = shape_to_valid_region(reference[0].shape());
+
+    for(size_t i = 1; i < reference.size(); ++i)
+    {
+        const ValidRegion valid_region = shape_to_valid_region_gaussian_pyramid_half(reference[i - 1].shape(), prev_valid_region, (border_mode == BorderMode::UNDEFINED));
+
+        // Validate outputs
+        validate(CLAccessor(*(target.get_pyramid_level(i))), reference[i], valid_region);
+
+        // Keep the valid region for the next level
+        prev_valid_region = valid_region;
+    }
+}
+
+} // namespace
+
+TEST_SUITE(CL)
+TEST_SUITE(GaussianPyramid)
+TEST_SUITE(Half)
+
+DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, large_gaussian_pyramid_levels,
+               shape, border_mode, num_levels)
+{
+    CLTensor src = create_tensor<CLTensor>(shape, DataType::U8);
+
+    // Create pyramid
+    PyramidInfo pyramid_info(num_levels, SCALE_PYRAMID_HALF, shape, Format::U8);
+    CLPyramid   dst;
+    dst.init(pyramid_info);
+
+    CLGaussianPyramidHalf gaussian_pyramid_half;
+    gaussian_pyramid_half.configure(&src, &dst, border_mode, 0);
+
+    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+    for(size_t level = 0; level < pyramid_info.num_levels(); ++level)
+    {
+        ARM_COMPUTE_EXPECT(dst.get_pyramid_level(level)->info()->is_resizable(), framework::LogLevel::ERRORS);
+    }
+}
+
+template <typename T>
+using CLGaussianPyramidHalfFixture = GaussianPyramidHalfValidationFixture<CLTensor, CLAccessor, CLGaussianPyramidHalf, T, CLPyramid>;
+
+FIXTURE_DATA_TEST_CASE(RunSmallGaussianPyramidHalf, CLGaussianPyramidHalfFixture<uint8_t>, framework::DatasetMode::ALL, small_gaussian_pyramid_levels)
+{
+    validate_gaussian_pyramid(_target, _reference, _border_mode);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLargeGaussianPyramidHalf, CLGaussianPyramidHalfFixture<uint8_t>, framework::DatasetMode::NIGHTLY, large_gaussian_pyramid_levels)
+{
+    validate_gaussian_pyramid(_target, _reference, _border_mode);
+}
+TEST_SUITE_END()
+TEST_SUITE_END()
+TEST_SUITE_END()
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/CL/GlobalPoolingLayer.cpp b/tests/validation/CL/GlobalPoolingLayer.cpp
index 31e3fe0..586be5e 100644
--- a/tests/validation/CL/GlobalPoolingLayer.cpp
+++ b/tests/validation/CL/GlobalPoolingLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -59,7 +59,9 @@
 
 TEST_SUITE(Float)
 TEST_SUITE(FP32)
-FIXTURE_DATA_TEST_CASE(RunGlobalPooling, CLGlobalPoolingLayerFixture<float>, framework::DatasetMode::ALL, combine(GlobalPoolingLayerDataset, framework::dataset::make("DataType", DataType::F32)))
+FIXTURE_DATA_TEST_CASE(RunGlobalPooling, CLGlobalPoolingLayerFixture<float>, framework::DatasetMode::ALL, combine(combine(GlobalPoolingLayerDataset, framework::dataset::make("DataType",
+                                                                                                                  DataType::F32)),
+                                                                                                                  framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f32);
@@ -67,8 +69,9 @@
 TEST_SUITE_END()
 
 TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunGlobalPooling, CLGlobalPoolingLayerFixture<half>, framework::DatasetMode::ALL, combine(GlobalPoolingLayerDataset, framework::dataset::make("DataType",
-                                                                                                                 DataType::F16)))
+FIXTURE_DATA_TEST_CASE(RunGlobalPooling, CLGlobalPoolingLayerFixture<half>, framework::DatasetMode::ALL, combine(combine(GlobalPoolingLayerDataset, framework::dataset::make("DataType",
+                                                                                                                 DataType::F16)),
+                                                                                                                 framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f16);
diff --git a/tests/validation/CL/HOGDetector.cpp b/tests/validation/CL/HOGDetector.cpp
new file mode 100644
index 0000000..6c2c18c
--- /dev/null
+++ b/tests/validation/CL/HOGDetector.cpp
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/CLArray.h"
+#include "arm_compute/runtime/CL/functions/CLHOGDescriptor.h"
+#include "arm_compute/runtime/CL/functions/CLHOGDetector.h"
+#include "tests/CL/CLAccessor.h"
+#include "tests/CL/CLArrayAccessor.h"
+#include "tests/CL/CLHOGAccessor.h"
+#include "tests/datasets/HOGDescriptorDataset.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/fixtures/HOGDetectorFixture.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace
+{
+/* Set the tolerance (percentage) used when validating the score of detection window. */
+RelativeTolerance<float> tolerance(0.01f);
+
+/* Input dataset (values must be a multiple of the HOGInfo block_size) */
+const auto DetectionWindowStrideDataset = framework::dataset::make("DetectionWindowStride", { Size2D(8, 8), Size2D(16, 16) });
+} // namespace
+
+TEST_SUITE(CL)
+TEST_SUITE(HOGDetector)
+
+// *INDENT-OFF*
+// clang-format off
+using CLHOGDetectorFixture = HOGDetectorValidationFixture<CLTensor,
+                                                          CLHOG,
+                                                          CLDetectionWindowArray,
+                                                          CLHOGDescriptor,
+                                                          CLAccessor,
+                                                          CLArrayAccessor<DetectionWindow>,
+                                                          CLHOGAccessor,
+                                                          CLHOGDetector,
+                                                          uint8_t,
+                                                          float>;
+
+FIXTURE_DATA_TEST_CASE(RunSmall, CLHOGDetectorFixture, framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(
+                       DetectionWindowStrideDataset,
+                       datasets::SmallHOGDescriptorDataset()),
+                       framework::dataset::make("Format", Format::U8)),
+                       framework::dataset::make("BorderMode", {BorderMode::CONSTANT, BorderMode::REPLICATE})))
+
+{
+    // Validate output
+    validate_detection_windows(_target.begin(), _target.end(), _reference.begin(), _reference.end(), tolerance);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, CLHOGDetectorFixture, framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(
+                       DetectionWindowStrideDataset,
+                       datasets::LargeHOGDescriptorDataset()),
+                       framework::dataset::make("Format", Format::U8)),
+                       framework::dataset::make("BorderMode", {BorderMode::CONSTANT, BorderMode::REPLICATE})))
+{
+    // Validate output
+    validate_detection_windows(_target.begin(), _target.end(), _reference.begin(), _reference.end(), tolerance);
+}
+
+// clang-format on
+// *INDENT-ON*
+
+TEST_SUITE_END()
+TEST_SUITE_END()
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/CL/HOGMultiDetection.cpp b/tests/validation/CL/HOGMultiDetection.cpp
new file mode 100644
index 0000000..634af41
--- /dev/null
+++ b/tests/validation/CL/HOGMultiDetection.cpp
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/CLMultiHOG.h"
+#include "arm_compute/runtime/CL/functions/CLHOGDescriptor.h"
+#include "arm_compute/runtime/CL/functions/CLHOGMultiDetection.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "tests/CL/CLAccessor.h"
+#include "tests/CL/CLArrayAccessor.h"
+#include "tests/CL/CLHOGAccessor.h"
+#include "tests/datasets/HOGMultiDetectionDataset.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/fixtures/HOGMultiDetectionFixture.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace
+{
+/* Set the tolerance (percentage) used when validating the strength of detection window. */
+RelativeTolerance<float> tolerance(0.1f);
+} // namespace
+
+TEST_SUITE(CL)
+TEST_SUITE(HOGMultiDetection)
+
+// *INDENT-OFF*
+// clang-format off
+using CLHOGMultiDetectionFixture = HOGMultiDetectionValidationFixture<CLTensor,
+                                                                      CLHOG,
+                                                                      CLMultiHOG,
+                                                                      CLDetectionWindowArray,
+                                                                      CLSize2DArray,
+                                                                      CLAccessor,
+                                                                      CLArrayAccessor<Size2D>,
+                                                                      CLArrayAccessor<DetectionWindow>,
+                                                                      CLHOGAccessor,
+                                                                      CLHOGMultiDetection,
+                                                                      uint8_t,
+                                                                      float>;
+
+
+FIXTURE_DATA_TEST_CASE(RunSmall, CLHOGMultiDetectionFixture, framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(
+                       datasets::SmallHOGMultiDetectionDataset(),
+                       framework::dataset::make("Format", Format::U8)),
+                       framework::dataset::make("BorderMode", {BorderMode::CONSTANT, BorderMode::REPLICATE})),
+                       framework::dataset::make("NonMaximaSuppression", {false, true})))
+{
+    // Validate output
+    validate_detection_windows(_target.begin(), _target.end(), _reference.begin(), _reference.end(), tolerance);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, CLHOGMultiDetectionFixture, framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(
+                       datasets::LargeHOGMultiDetectionDataset(),
+                       framework::dataset::make("Format", Format::U8)),
+                       framework::dataset::make("BorderMode", {BorderMode::CONSTANT, BorderMode::REPLICATE})),
+                       framework::dataset::make("NonMaximaSuppression", {false, true})))
+{
+    // Validate output
+    validate_detection_windows(_target.begin(), _target.end(), _reference.begin(), _reference.end(), tolerance);
+}
+
+// clang-format on
+// *INDENT-ON*
+
+TEST_SUITE_END()
+TEST_SUITE_END()
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/CL/L2NormalizeLayer.cpp b/tests/validation/CL/L2NormalizeLayer.cpp
index bc2374b..3d121b0 100644
--- a/tests/validation/CL/L2NormalizeLayer.cpp
+++ b/tests/validation/CL/L2NormalizeLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -50,6 +50,37 @@
 TEST_SUITE(CL)
 TEST_SUITE(L2NormalizeLayer)
 
+// *INDENT-OFF*
+// clang-format off
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
+    framework::dataset::make("InputInfo",  { TensorInfo(TensorShape(128U, 64U), 1, DataType::F32), // Mismatching data type input/output
+                                             TensorInfo(TensorShape(128U, 64U), 1, DataType::F32), // Mismatching shape input/output
+                                             TensorInfo(TensorShape(128U, 64U), 2, DataType::F32), // Number of Input channels != 1
+                                             TensorInfo(TensorShape(128U, 64U), 1, DataType::S16), // DataType != F32
+                                             TensorInfo(TensorShape(128U, 64U), 1, DataType::F32), // Axis >= num_max_dimensions
+                                             TensorInfo(TensorShape(128U, 64U), 1, DataType::F32), // Axis > 0
+                                             TensorInfo(TensorShape(128U, 64U), 1, DataType::F32)
+                                           }),
+    framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(128U, 64U), 1, DataType::F16),
+                                             TensorInfo(TensorShape(256U, 64U), 1, DataType::F32),
+                                             TensorInfo(TensorShape(128U, 64U), 1, DataType::F32),
+                                             TensorInfo(TensorShape(128U, 64U), 1, DataType::S16),
+                                             TensorInfo(TensorShape(128U, 64U), 1, DataType::F32),
+                                             TensorInfo(TensorShape(128U, 64U), 1, DataType::F32),
+                                             TensorInfo(TensorShape(128U, 64U), 1, DataType::F32)
+                                           })),
+    framework::dataset::make("Axis",       { 0U, 0U, 0U, 0U, static_cast<unsigned int>(TensorShape::num_max_dimensions), 1U, 0U })),
+    framework::dataset::make("Expected",   { false, false, false, false, false, false, true })),
+    input_info, output_info, axis, expected)
+{
+    bool is_valid = bool(CLL2NormalizeLayer::validate(&input_info.clone()->set_is_resizable(false),
+                                                      &output_info.clone()->set_is_resizable(false),
+                                                      axis));
+    ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS);
+}
+// clang-format on
+// *INDENT-ON*
+
 template <typename T>
 using CLL2NormalizeLayerFixture = L2NormalizeLayerValidationFixture<CLTensor, CLAccessor, CLL2NormalizeLayer, T>;
 
diff --git a/tests/validation/CL/LSTMLayer.cpp b/tests/validation/CL/LSTMLayer.cpp
new file mode 100644
index 0000000..bd43678
--- /dev/null
+++ b/tests/validation/CL/LSTMLayer.cpp
@@ -0,0 +1,177 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLLSTMLayer.h"
+#include "tests/CL/CLAccessor.h"
+#include "tests/PaddingCalculator.h"
+#include "tests/datasets/LSTMLayerDataset.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/fixtures/LSTMLayerFixture.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace
+{
+RelativeTolerance<float> tolerance_f32(0.001f);
+RelativeTolerance<half>  tolerance_f16(half(0.1));
+} // namespace
+
+TEST_SUITE(CL)
+TEST_SUITE(LSTMLayer)
+
+// *INDENT-OFF*
+// clang-format off
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zip(zip(zip(
+               framework::dataset::make("InputInfo", { TensorInfo(TensorShape(8U, 2U), 1, DataType::U8, 0),      // Wrong data type
+                                                       TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32, 0), // Wrong input size
+                                                       TensorInfo(TensorShape(8U, 2U), 1, DataType::F32, 0),     // Wrong input weights size
+                                                       TensorInfo(TensorShape(8U, 2U), 1, DataType::F32, 0),     // Wrong recurrent weights size
+                                                       TensorInfo(TensorShape(8U, 2U), 1, DataType::F32, 0),     // Wrong cell bias size
+                                                       TensorInfo(TensorShape(8U, 2U), 1, DataType::F32, 0),     // Wrong cell state size
+                                                       TensorInfo(TensorShape(8U, 2U), 1, DataType::F32, 0),     // Wrong output size
+                                                       TensorInfo(TensorShape(8U, 2U), 1, DataType::F32, 0),     // Wrong scratch size
+               }),
+               framework::dataset::make("InputWeightsInfo", { TensorInfo(TensorShape(8U, 16U), 1, DataType::F32, 0),
+                                                       TensorInfo(TensorShape(8U, 16U), 1, DataType::F32, 0),
+                                                       TensorInfo(TensorShape(27U, 11U, 2U), 1, DataType::F32, 0),
+                                                       TensorInfo(TensorShape(8U, 16U), 1, DataType::F32, 0),
+                                                       TensorInfo(TensorShape(8U, 16U), 1, DataType::F32, 0),
+                                                       TensorInfo(TensorShape(8U, 16U), 1, DataType::F32, 0),
+                                                       TensorInfo(TensorShape(8U, 16U), 1, DataType::F32, 0),
+                                                       TensorInfo(TensorShape(8U, 16U), 1, DataType::F32, 0),
+               })),
+               framework::dataset::make("RecurrentWeightsInfo", { TensorInfo(TensorShape(16U, 16U), 1, DataType::F32, 0),
+                                                                  TensorInfo(TensorShape(16U, 16U), 1, DataType::F32, 0),
+                                                                  TensorInfo(TensorShape(16U, 16U), 1, DataType::F32, 0),
+                                                                  TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F32, 0),
+                                                                  TensorInfo(TensorShape(16U, 16U), 1, DataType::F32, 0),
+                                                                  TensorInfo(TensorShape(16U, 16U), 1, DataType::F32, 0),
+                                                                  TensorInfo(TensorShape(16U, 16U), 1, DataType::F32, 0),
+                                                                  TensorInfo(TensorShape(16U, 16U), 1, DataType::F32, 0),
+               })),
+               framework::dataset::make("CellBiasInfo", { TensorInfo(TensorShape(16U), 1, DataType::F32, 0),
+                                                      TensorInfo(TensorShape(16U), 1, DataType::F32, 0),
+                                                      TensorInfo(TensorShape(16U), 1, DataType::F32, 0),
+                                                      TensorInfo(TensorShape(16U), 1, DataType::F32, 0),
+                                                      TensorInfo(TensorShape(30U), 1, DataType::F32, 0),
+                                                      TensorInfo(TensorShape(16U), 1, DataType::F32, 0),
+                                                      TensorInfo(TensorShape(16U), 1, DataType::F32, 0),
+                                                      TensorInfo(TensorShape(16U), 1, DataType::F32, 0),
+               })),
+               framework::dataset::make("ProjectionBiasInfo", { TensorInfo(TensorShape(16U), 1, DataType::F32, 0),
+                                                      TensorInfo(TensorShape(16U), 1, DataType::F32, 0),
+                                                      TensorInfo(TensorShape(16U), 1, DataType::F32, 0),
+                                                      TensorInfo(TensorShape(16U), 1, DataType::F32, 0),
+                                                      TensorInfo(TensorShape(16U), 1, DataType::F32, 0),
+                                                      TensorInfo(TensorShape(16U), 1, DataType::F32, 0),
+                                                      TensorInfo(TensorShape(16U), 1, DataType::F32, 0),
+                                                      TensorInfo(TensorShape(16U), 1, DataType::F32, 0),
+               })),
+               framework::dataset::make("CellStateInfo", { TensorInfo(TensorShape(16U, 2U), 1, DataType::F32, 0),
+                                                        TensorInfo(TensorShape(16U, 2U), 1, DataType::F32, 0),
+                                                        TensorInfo(TensorShape(16U, 2U), 1, DataType::F32, 0),
+                                                        TensorInfo(TensorShape(16U, 2U), 1, DataType::F32, 0),
+                                                        TensorInfo(TensorShape(16U, 2U), 1, DataType::F32, 0),
+                                                        TensorInfo(TensorShape(11U), 1, DataType::F32, 0),
+                                                        TensorInfo(TensorShape(16U, 2U), 1, DataType::F32, 0),
+                                                        TensorInfo(TensorShape(16U, 2U), 1, DataType::F32, 0),
+               })),
+               framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(16U, 2U), 1, DataType::F32, 0),
+                                                        TensorInfo(TensorShape(16U, 2U), 1, DataType::F32, 0),
+                                                        TensorInfo(TensorShape(16U, 2U), 1, DataType::F32, 0),
+                                                        TensorInfo(TensorShape(16U, 2U), 1, DataType::F32, 0),
+                                                        TensorInfo(TensorShape(16U, 2U), 1, DataType::F32, 0),
+                                                        TensorInfo(TensorShape(16U, 2U), 1, DataType::F32, 0),
+                                                        TensorInfo(TensorShape(11U, 13U), 1, DataType::F32, 0),
+                                                        TensorInfo(TensorShape(16U, 2U), 1, DataType::F32, 0),
+               })),
+               framework::dataset::make("ScratchInfo", { TensorInfo(TensorShape(64U, 2U), 1, DataType::F32, 0),
+                                                             TensorInfo(TensorShape(64U, 2U), 1, DataType::F32, 0),
+                                                             TensorInfo(TensorShape(64U, 2U), 1, DataType::F32, 0),
+                                                             TensorInfo(TensorShape(64U, 2U), 1, DataType::F32, 0),
+                                                             TensorInfo(TensorShape(64U, 2U), 1, DataType::F32, 0),
+                                                             TensorInfo(TensorShape(64U, 2U), 1, DataType::F32, 0),
+                                                             TensorInfo(TensorShape(64U, 2U), 1, DataType::F32, 0),
+                                                             TensorInfo(TensorShape(12U, 2U), 1, DataType::F32, 0),
+               })),
+               framework::dataset::make("ActivationInfo", { ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
+                                                            ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
+                                                            ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
+                                                            ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
+                                                            ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
+                                                            ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
+                                                            ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
+                                                            ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
+               })),
+               framework::dataset::make("Expected", { false, false, false, false, false, false, false, false })),
+               input_info, input_weights_info, recurrent_weights_info, cell_bias_info, projection_bias_info, cell_state_info, output_info, scratch_info, info, expected)
+{
+    LSTMParams<ITensorInfo> lstm_params_info;
+    lstm_params_info.set_peephole_params(&cell_bias_info, &cell_bias_info, &cell_bias_info)
+                    .set_projection_params(&recurrent_weights_info, &projection_bias_info)
+                    .set_cifg_params(&input_weights_info, &recurrent_weights_info, &cell_bias_info, &cell_bias_info);
+
+    ARM_COMPUTE_EXPECT(bool(CLLSTMLayer::validate(&input_info.clone()->set_is_resizable(false), &input_weights_info.clone()->set_is_resizable(false), &input_weights_info.clone()->set_is_resizable(false),
+                                                  &input_weights_info.clone()->set_is_resizable(false), &recurrent_weights_info.clone()->set_is_resizable(false), &recurrent_weights_info.clone()->set_is_resizable(false),
+                                                  &recurrent_weights_info.clone()->set_is_resizable(false), &cell_bias_info.clone()->set_is_resizable(false), &cell_bias_info.clone()->set_is_resizable(false),
+                                                  &cell_bias_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), &cell_state_info.clone()->set_is_resizable(false),
+                                                  &scratch_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), lstm_params_info, info, 0.05, 0.9)) == expected, framework::LogLevel::ERRORS);
+}
+// clang-format on
+// *INDENT-ON*
+
+template <typename T>
+using CLLSTMLayerFixture = LSTMLayerValidationFixture<CLTensor, CLAccessor, CLLSTMLayer, LSTMParams<ICLTensor>, T>;
+
+TEST_SUITE(FP32)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLLSTMLayerFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallLSTMLayerDataset(), framework::dataset::make("DataType",
+                                                                                                                 DataType::F32)),
+                                                                                                         framework::dataset::make("ProjectionOpt", { true, false })),
+                                                                                                 framework::dataset::make("PeepholeOpt", { true, false })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+TEST_SUITE_END() // FP32
+
+TEST_SUITE(FP16)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLLSTMLayerFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallLSTMLayerDataset(), framework::dataset::make("DataType", DataType::F16)),
+                                                                                                        framework::dataset::make("ProjectionOpt", { true, false })),
+                                                                                                framework::dataset::make("PeepholeOpt", { true, false })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_f16);
+}
+TEST_SUITE_END() // FP16
+TEST_SUITE_END() // LSTMLayer
+TEST_SUITE_END() // CL
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/CL/LocallyConnected.cpp b/tests/validation/CL/LocallyConnected.cpp
new file mode 100644
index 0000000..d8f236c
--- /dev/null
+++ b/tests/validation/CL/LocallyConnected.cpp
@@ -0,0 +1,160 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/CLTensorAllocator.h"
+#include "arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h"
+#include "tests/CL/CLAccessor.h"
+#include "tests/PaddingCalculator.h"
+#include "tests/datasets/LocallyConnectedDataset.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/fixtures/LocallyConnectedFixture.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace
+{
+constexpr AbsoluteTolerance<float> atolerance_f32(0.00001f); /**< Absolute Tolerance value for comparing reference's output against implementation's output for DataType::F32 */
+RelativeTolerance<float>           rtolerance_f32(0.05f);    /**< Tolerance value for comparing reference's output against implementation's output for DataType::F32 */
+} // namespace
+
+TEST_SUITE(CL)
+TEST_SUITE(LocallyConnected)
+
+// *INDENT-OFF*
+// clang-format off
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(
+    framework::dataset::make("InputInfo",  { TensorInfo(TensorShape(23U, 27U, 5U), 1, DataType::F32, 0), // Mismatching data type input/weights
+                                             TensorInfo(TensorShape(23U, 27U, 5U), 1, DataType::F32, 0), // Mismatching data type input/bias
+                                             TensorInfo(TensorShape(23U, 27U, 5U), 1, DataType::F32, 0), // Mismatching data type input/output
+                                             TensorInfo(TensorShape(23U, 27U, 5U), 1, DataType::F32, 0), // Mismatching shape input/weights
+                                             TensorInfo(TensorShape(23U, 27U, 5U), 1, DataType::F32, 0), // Mismatching shape input/bias
+                                             TensorInfo(TensorShape(23U, 27U, 5U), 1, DataType::F32, 0), // Mismatching shape input/output
+                                             TensorInfo(TensorShape(23U, 27U, 5U), 1, DataType::F32, 0), // Asymmetric padding
+                                             TensorInfo(TensorShape(23U, 27U, 5U), 1, DataType::F32, 0)
+                                           }),
+    framework::dataset::make("WeightsInfo",{ TensorInfo(TensorShape(3U, 3U, 5U, 21U, 275U), 1, DataType::F16, 0),
+                                             TensorInfo(TensorShape(3U, 3U, 5U, 21U, 275U), 1, DataType::F32, 0),
+                                             TensorInfo(TensorShape(3U, 3U, 5U, 21U, 275U), 1, DataType::F32, 0),
+                                             TensorInfo(TensorShape(3U, 3U, 5U, 21U, 274U), 1, DataType::F32, 0),
+                                             TensorInfo(TensorShape(3U, 3U, 5U, 21U, 275U), 1, DataType::F32, 0),
+                                             TensorInfo(TensorShape(3U, 3U, 5U, 21U, 275U), 1, DataType::F32, 0),
+                                             TensorInfo(TensorShape(3U, 3U, 5U, 21U, 275U), 1, DataType::F32, 0),
+                                             TensorInfo(TensorShape(3U, 3U, 5U, 21U, 275U), 1, DataType::F32, 0)
+                                           })),
+    framework::dataset::make("BiasInfo",   { TensorInfo(TensorShape(21U, 275U), 1, DataType::F32, 0),
+                                             TensorInfo(TensorShape(21U, 275U), 1, DataType::F16, 0),
+                                             TensorInfo(TensorShape(21U, 275U), 1, DataType::F32, 0),
+                                             TensorInfo(TensorShape(21U, 275U), 1, DataType::F32, 0),
+                                             TensorInfo(TensorShape(21U, 274U), 1, DataType::F32, 0),
+                                             TensorInfo(TensorShape(21U, 275U), 1, DataType::F32, 0),
+                                             TensorInfo(TensorShape(21U, 275U), 1, DataType::F32, 0),
+                                             TensorInfo(TensorShape(21U, 275U), 1, DataType::F32, 0)
+                                           })),
+    framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(11U, 25U, 21U), 1, DataType::F32, 0),
+                                             TensorInfo(TensorShape(11U, 25U, 21U), 1, DataType::F32, 0),
+                                             TensorInfo(TensorShape(11U, 25U, 21U), 1, DataType::F16, 0),
+                                             TensorInfo(TensorShape(11U, 25U, 21U), 1, DataType::F32, 0),
+                                             TensorInfo(TensorShape(11U, 25U, 21U), 1, DataType::F32, 0),
+                                             TensorInfo(TensorShape(11U, 25U, 22U), 1, DataType::F32, 0),
+                                             TensorInfo(TensorShape(11U, 25U, 21U), 1, DataType::F32, 0),
+                                             TensorInfo(TensorShape(11U, 25U, 21U), 1, DataType::F32, 0)
+                                           })),
+    framework::dataset::make("PadStride",  { PadStrideInfo(2, 1, 0, 0),
+                                             PadStrideInfo(2, 1, 0, 0),
+                                             PadStrideInfo(2, 1, 0, 0),
+                                             PadStrideInfo(2, 1, 0, 0),
+                                             PadStrideInfo(2, 1, 0, 0),
+                                             PadStrideInfo(2, 1, 0, 0),
+                                             PadStrideInfo(2, 1, 1, 0, 0, 0, DimensionRoundingType::FLOOR),
+                                             PadStrideInfo(2, 1, 0, 0)
+                                           })),
+    framework::dataset::make("Expected", { false, false, false, false, false, false, false, true })),
+    input_info, weights_info, bias_info, output_info, conv_info, expected)
+{
+    bool is_valid = bool(CLLocallyConnectedLayer::validate(&input_info.clone()->set_is_resizable(false),
+                                                           &weights_info.clone()->set_is_resizable(false),
+                                                           &bias_info.clone()->set_is_resizable(false),
+                                                           &output_info.clone()->set_is_resizable(false),
+                                                           conv_info));
+    ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS);
+}
+// clang-format on
+// *INDENT-ON*
+
+DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(framework::dataset::concat(datasets::SmallLocallyConnectedDataset(), datasets::LargeLocallyConnectedDataset()),
+                                                                   framework::dataset::make("DataType", DataType::F32)),
+               src_shape, weights_shape, bias_shape, dst_shape, info, dilation, data_type)
+{
+    ARM_COMPUTE_UNUSED(dilation);
+
+    // Create tensors
+    CLTensor src     = create_tensor<CLTensor>(src_shape, data_type);
+    CLTensor weights = create_tensor<CLTensor>(weights_shape, data_type);
+    CLTensor bias    = create_tensor<CLTensor>(bias_shape, data_type);
+    CLTensor dst     = create_tensor<CLTensor>(dst_shape, data_type);
+
+    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(weights.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(bias.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+    // Create and configure function.
+    CLLocallyConnectedLayer lc;
+    lc.configure(&src, &weights, &bias, &dst, info);
+
+    // Validate valid region
+    const ValidRegion dst_valid_region = shape_to_valid_region(dst_shape);
+    validate(dst.info()->valid_region(), dst_valid_region);
+}
+
+template <typename T>
+using CLLocallyConnectedFixture = LocallyConnectedValidationFixture<CLTensor, CLAccessor, CLLocallyConnectedLayer, T>;
+FIXTURE_DATA_TEST_CASE(RunSmall, CLLocallyConnectedFixture<float>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallLocallyConnectedDataset(),
+                                                                                                              framework::dataset::make("DataType",
+                                                                                                                      DataType::F32)))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, rtolerance_f32, 0.f, atolerance_f32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, CLLocallyConnectedFixture<float>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeLocallyConnectedDataset(),
+                                                                                                            framework::dataset::make("DataType",
+                                                                                                                    DataType::F32)))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, rtolerance_f32, 0.f, atolerance_f32);
+}
+TEST_SUITE_END()
+TEST_SUITE_END()
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/CL/OpticalFlow.cpp b/tests/validation/CL/OpticalFlow.cpp
new file mode 100644
index 0000000..006d40a
--- /dev/null
+++ b/tests/validation/CL/OpticalFlow.cpp
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/CLArray.h"
+#include "arm_compute/runtime/CL/CLPyramid.h"
+#include "arm_compute/runtime/CL/functions/CLGaussianPyramid.h"
+#include "arm_compute/runtime/CL/functions/CLOpticalFlow.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "tests/CL/CLAccessor.h"
+#include "tests/CL/CLArrayAccessor.h"
+#include "tests/datasets/BorderModeDataset.h"
+#include "tests/datasets/OpticalFlowDataset.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/fixtures/OpticalFlowFixture.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+TEST_SUITE(CL)
+TEST_SUITE(OpticalFlow)
+
+// *INDENT-OFF*
+// clang-format off
+using CLOpticalFlowFixture = OpticalFlowValidationFixture<CLTensor,
+                                                          CLAccessor,
+                                                          CLKeyPointArray,
+                                                          CLArrayAccessor<KeyPoint>,
+                                                          CLOpticalFlow,
+                                                          CLPyramid,
+                                                          CLGaussianPyramidHalf,
+                                                          uint8_t>;
+
+FIXTURE_DATA_TEST_CASE(RunSmall, CLOpticalFlowFixture, framework::DatasetMode::PRECOMMIT, combine(combine(
+                       datasets::SmallOpticalFlowDataset(),
+                       framework::dataset::make("Format", Format::U8)),
+                       datasets::BorderModes()))
+{
+    // Validate output
+    CLArrayAccessor<KeyPoint> array(_target);
+
+    validate_keypoints(array.buffer(),
+                       array.buffer() + array.num_values(),
+                       _reference.begin(),
+                       _reference.end());
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, CLOpticalFlowFixture, framework::DatasetMode::NIGHTLY, combine(combine(
+                       datasets::LargeOpticalFlowDataset(),
+                       framework::dataset::make("Format", Format::U8)),
+                       datasets::BorderModes()))
+{
+    // Validate output
+    CLArrayAccessor<KeyPoint> array(_target);
+
+    validate_keypoints(array.buffer(),
+                       array.buffer() + array.num_values(),
+                       _reference.begin(),
+                       _reference.end());
+}
+// clang-format on
+// *INDENT-ON*
+
+TEST_SUITE_END()
+TEST_SUITE_END()
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/CL/Permute.cpp b/tests/validation/CL/Permute.cpp
index bdd8f6e..1371e71 100644
--- a/tests/validation/CL/Permute.cpp
+++ b/tests/validation/CL/Permute.cpp
@@ -53,7 +53,7 @@
 // *INDENT-OFF*
 // clang-format off
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
-                                                framework::dataset::make("InputInfo",{  
+                                                framework::dataset::make("InputInfo",{
                                                                                         TensorInfo(TensorShape(7U, 7U, 5U, 3U), 1, DataType::U16),     // permutation not supported
                                                                                         TensorInfo(TensorShape(7U, 7U, 5U, 3U), 1, DataType::U16),     // permutation not supported
                                                                                         TensorInfo(TensorShape(7U, 7U, 5U, 3U), 1, DataType::U16),     // permutation not supported
@@ -66,27 +66,27 @@
                                                                                         TensorInfo(TensorShape(128U, 64U, 21U, 2U), 1, DataType::F32), // permutation not supported
                                                                                         TensorInfo(TensorShape(128U, 64U, 21U, 2U), 1, DataType::U16), // permutation not supported
                                                                                     }),
-                                                framework::dataset::make("OutputInfo", { 
-                                                                                        TensorInfo(TensorShape(5U, 7U, 7U, 3U), 1, DataType::U16),     
-                                                                                        TensorInfo(TensorShape(5U, 5U, 7U, 3U), 1, DataType::U16),     
+                                                framework::dataset::make("OutputInfo", {
+                                                                                        TensorInfo(TensorShape(5U, 7U, 7U, 3U), 1, DataType::U16),
+                                                                                        TensorInfo(TensorShape(5U, 5U, 7U, 3U), 1, DataType::U16),
                                                                                         TensorInfo(TensorShape(7U, 7U, 7U, 3U), 1, DataType::U16),
                                                                                         TensorInfo(TensorShape(5U, 7U), 1, DataType::U8),
-                                                                                        TensorInfo(TensorShape(5U, 7U, 7U, 3U), 1, DataType::U16), 
-                                                                                        TensorInfo(TensorShape(13U, 37U, 27U, 2U), 1, DataType::F32), 
-                                                                                        TensorInfo(TensorShape(2U, 37U, 27U, 13U), 1, DataType::F32), 
+                                                                                        TensorInfo(TensorShape(5U, 7U, 7U, 3U), 1, DataType::U16),
+                                                                                        TensorInfo(TensorShape(13U, 37U, 27U, 2U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(2U, 37U, 27U, 13U), 1, DataType::F32),
                                                                                         TensorInfo(TensorShape(128U, 64U, 21U, 2U), 1, DataType::QASYMM8),
                                                                                         TensorInfo(TensorShape(128U, 64U, 21U, 2U), 1, DataType::F32),
-                                                                                        TensorInfo(TensorShape(21U, 64U, 2U, 128U), 1, DataType::F32), 
-                                                                                        TensorInfo(TensorShape(2U, 21U, 64U, 128U), 1, DataType::U16), 
+                                                                                        TensorInfo(TensorShape(21U, 64U, 2U, 128U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(2U, 21U, 64U, 128U), 1, DataType::U16),
                                                                                     })),
-                                                framework::dataset::make("PermutationVector", { 
+                                                framework::dataset::make("PermutationVector", {
                                                                                                 PermutationVector(2U, 1U, 0U),
                                                                                                 PermutationVector(2U, 2U, 1U),
                                                                                                 PermutationVector(1U, 1U, 1U),
                                                                                                 PermutationVector(2U, 0U, 1U),
-                                                                                                PermutationVector(2U, 0U, 1U), 
-                                                                                                PermutationVector(1U, 2U, 0U), 
-                                                                                                PermutationVector(3U, 2U, 0U, 1U), 
+                                                                                                PermutationVector(2U, 0U, 1U),
+                                                                                                PermutationVector(1U, 2U, 0U),
+                                                                                                PermutationVector(3U, 2U, 0U, 1U),
                                                                                                 PermutationVector(2U, 3U, 1U, 0U),
                                                                                                 PermutationVector(1U, 1U, 1U, 1U),
                                                                                                 PermutationVector(2U, 1U, 3U, 0U),
diff --git a/tests/validation/CL/PoolingLayer.cpp b/tests/validation/CL/PoolingLayer.cpp
index 9da4c55..7bd090c 100644
--- a/tests/validation/CL/PoolingLayer.cpp
+++ b/tests/validation/CL/PoolingLayer.cpp
@@ -57,7 +57,7 @@
 /** Input data set for asymmetric data type */
 const auto PoolingLayerDatasetQASYMM8 = combine(combine(combine(framework::dataset::make("PoolingType", { PoolingType::MAX, PoolingType::AVG }), framework::dataset::make("PoolingSize", { Size2D(2, 2), Size2D(3, 3), Size2D(5, 7), Size2D(8, 9) })),
                                                         framework::dataset::make("PadStride", { PadStrideInfo(1, 1, 0, 0), PadStrideInfo(2, 1, 0, 0), PadStrideInfo(1, 2, 1, 1), PadStrideInfo(2, 2, 1, 0) })),
-                                                framework::dataset::make("ExcludePadding", { true, false }));
+                                                framework::dataset::make("ExcludePadding", { true }));
 
 constexpr AbsoluteTolerance<float>   tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for 32-bit floating-point type */
 constexpr AbsoluteTolerance<float>   tolerance_f16(0.01f);  /**< Tolerance value for comparing reference's output against implementation's output for 16-bit floating-point type */
@@ -126,35 +126,40 @@
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f32);
 }
-FIXTURE_DATA_TEST_CASE(RunSmall, CLPoolingLayerFixture<float>, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), combine(PoolingLayerDatasetFP, framework::dataset::make("DataType",
-                                                                                                    DataType::F32))))
+FIXTURE_DATA_TEST_CASE(RunSmall, CLPoolingLayerFixture<float>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), combine(PoolingLayerDatasetFP, framework::dataset::make("DataType",
+                                                                                                            DataType::F32))),
+                                                                                                    framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f32);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, CLPoolingLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapes(), combine(PoolingLayerDatasetFP, framework::dataset::make("DataType",
-                                                                                                        DataType::F32))))
+FIXTURE_DATA_TEST_CASE(RunLarge, CLPoolingLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(), combine(PoolingLayerDatasetFP,
+                                                                                                                framework::dataset::make("DataType",
+                                                                                                                        DataType::F32))),
+                                                                                                        framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f32);
 }
-TEST_SUITE_END()
+TEST_SUITE_END() // FP32
 
 TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLPoolingLayerFixture<half>, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), combine(PoolingLayerDatasetFP,
-                                                                                                   framework::dataset::make("DataType", DataType::F16))))
+FIXTURE_DATA_TEST_CASE(RunSmall, CLPoolingLayerFixture<half>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), combine(PoolingLayerDatasetFP,
+                                                                                                           framework::dataset::make("DataType", DataType::F16))),
+                                                                                                   framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f16);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, CLPoolingLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapes(), combine(PoolingLayerDatasetFP,
-                                                                                                       framework::dataset::make("DataType", DataType::F16))))
+FIXTURE_DATA_TEST_CASE(RunLarge, CLPoolingLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(), combine(PoolingLayerDatasetFP,
+                                                                                                               framework::dataset::make("DataType", DataType::F16))),
+                                                                                                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f16);
 }
-TEST_SUITE_END()
-TEST_SUITE_END()
+TEST_SUITE_END() // FP16
+TEST_SUITE_END() // Float
 
 template <typename T>
 using CLPoolingLayerFixedPointFixture = PoolingLayerValidationFixedPointFixture<CLTensor, CLAccessor, CLPoolingLayer, T>;
@@ -175,7 +180,7 @@
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_qs8);
 }
-TEST_SUITE_END()
+TEST_SUITE_END() // QS8
 
 TEST_SUITE(QS16)
 FIXTURE_DATA_TEST_CASE(RunTiny, CLPoolingLayerFixedPointFixture<int16_t>, framework::DatasetMode::ALL, combine(combine(datasets::TinyShapes(), combine(PoolingLayerDatasetQS,
@@ -192,8 +197,8 @@
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_qs16);
 }
-TEST_SUITE_END()
-TEST_SUITE_END()
+TEST_SUITE_END() // QS16
+TEST_SUITE_END() // FixedPoint
 
 TEST_SUITE(Quantized)
 
@@ -201,27 +206,28 @@
 using CLPoolingLayerQuantizedFixture = PoolingLayerValidationQuantizedFixture<CLTensor, CLAccessor, CLPoolingLayer, T>;
 
 TEST_SUITE(QASYMM8)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLPoolingLayerQuantizedFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), combine(PoolingLayerDatasetQASYMM8,
+FIXTURE_DATA_TEST_CASE(RunSmall, CLPoolingLayerQuantizedFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallShapes(), combine(PoolingLayerDatasetQASYMM8,
                                                                                                                        framework::dataset::make("DataType", DataType::QASYMM8))),
-                                                                                                               framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255, 127),
-                                                                                                                       QuantizationInfo(7.f / 255, 123)
-                                                                                                                                                            })))
+                                                                                                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255, 127),
+                                                                                                                               QuantizationInfo(7.f / 255, 123)
+                                                                                                                                                                    })),
+                                                                                                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_qasymm8);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, CLPoolingLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(), combine(PoolingLayerDatasetQASYMM8,
+FIXTURE_DATA_TEST_CASE(RunLarge, CLPoolingLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), combine(PoolingLayerDatasetQASYMM8,
                                                                                                                    framework::dataset::make("DataType", DataType::QASYMM8))),
-                                                                                                                   framework::dataset::make("QuantizationInfo", { QuantizationInfo(1.f / 255, 0) })))
+                                                                                                                   framework::dataset::make("QuantizationInfo", { QuantizationInfo(1.f / 255, 0) })),
+                                                                                                                   framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_qasymm8);
 }
-TEST_SUITE_END()
-TEST_SUITE_END()
-
-TEST_SUITE_END()
-TEST_SUITE_END()
+TEST_SUITE_END() // QASYMM8
+TEST_SUITE_END() // Quantized
+TEST_SUITE_END() // PoolingLayer
+TEST_SUITE_END() // CL
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/CL/RNNLayer.cpp b/tests/validation/CL/RNNLayer.cpp
new file mode 100644
index 0000000..0af6f8e
--- /dev/null
+++ b/tests/validation/CL/RNNLayer.cpp
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLRNNLayer.h"
+#include "tests/CL/CLAccessor.h"
+#include "tests/PaddingCalculator.h"
+#include "tests/datasets/RNNLayerDataset.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/fixtures/RNNLayerFixture.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace
+{
+RelativeTolerance<float> tolerance_f32(0.001f);
+RelativeTolerance<half>  tolerance_f16(half(0.1));
+} // namespace
+
+TEST_SUITE(CL)
+TEST_SUITE(RNNLayer)
+
+// *INDENT-OFF*
+// clang-format off
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zip(
+               framework::dataset::make("InputInfo", { TensorInfo(TensorShape(27U, 13U), 1, DataType::U8, 0),      // Wrong data type
+                                                       TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32, 0), // Wrong input size
+                                                       TensorInfo(TensorShape(27U, 13U), 1, DataType::F32, 0),     // Wrong weights size
+                                                       TensorInfo(TensorShape(27U, 13U), 1, DataType::F32, 0),     // Wrong recurrent weights size
+                                                       TensorInfo(TensorShape(27U, 13U), 1, DataType::F32, 0),     // Wrong bias size
+                                                       TensorInfo(TensorShape(27U, 13U), 1, DataType::F32, 0),     // Wrong output size
+                                                       TensorInfo(TensorShape(27U, 13U), 1, DataType::F32, 0),     // Wrong hidden output size
+               }),
+               framework::dataset::make("WeightsInfo", { TensorInfo(TensorShape(27U, 11U), 1, DataType::F32, 0),
+                                                       TensorInfo(TensorShape(27U, 11U), 1, DataType::F32, 0),
+                                                       TensorInfo(TensorShape(27U, 11U, 2U), 1, DataType::F32, 0),
+                                                       TensorInfo(TensorShape(27U, 11U), 1, DataType::F32, 0),
+                                                       TensorInfo(TensorShape(27U, 11U), 1, DataType::F32, 0),
+                                                       TensorInfo(TensorShape(27U, 11U), 1, DataType::F32, 0),
+                                                       TensorInfo(TensorShape(27U, 11U), 1, DataType::F32, 0),
+               })),
+               framework::dataset::make("RecurrentWeightsInfo", { TensorInfo(TensorShape(11U, 11U), 1, DataType::F32, 0),
+                                                                  TensorInfo(TensorShape(11U, 11U), 1, DataType::F32, 0),
+                                                                  TensorInfo(TensorShape(11U, 11U), 1, DataType::F32, 0),
+                                                                  TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F32, 0),
+                                                                  TensorInfo(TensorShape(11U, 11U), 1, DataType::F32, 0),
+                                                                  TensorInfo(TensorShape(11U, 11U), 1, DataType::F32, 0),
+                                                                  TensorInfo(TensorShape(11U, 11U), 1, DataType::F32, 0),
+               })),
+               framework::dataset::make("BiasInfo", { TensorInfo(TensorShape(11U), 1, DataType::F32, 0),
+                                                      TensorInfo(TensorShape(11U), 1, DataType::F32, 0),
+                                                      TensorInfo(TensorShape(11U), 1, DataType::F32, 0),
+                                                      TensorInfo(TensorShape(11U), 1, DataType::F32, 0),
+                                                      TensorInfo(TensorShape(30U), 1, DataType::F32, 0),
+                                                      TensorInfo(TensorShape(11U), 1, DataType::F32, 0),
+                                                      TensorInfo(TensorShape(11U), 1, DataType::F32, 0),
+               })),
+               framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(11U, 13U), 1, DataType::F32, 0),
+                                                        TensorInfo(TensorShape(11U, 13U), 1, DataType::F32, 0),
+                                                        TensorInfo(TensorShape(11U, 13U), 1, DataType::F32, 0),
+                                                        TensorInfo(TensorShape(11U, 13U), 1, DataType::F32, 0),
+                                                        TensorInfo(TensorShape(11U, 13U), 1, DataType::F32, 0),
+                                                        TensorInfo(TensorShape(11U), 1, DataType::F32, 0),
+                                                        TensorInfo(TensorShape(11U, 13U), 1, DataType::F32, 0),
+               })),
+               framework::dataset::make("HiddenStateInfo", { TensorInfo(TensorShape(11U, 13U), 1, DataType::F32, 0),
+                                                             TensorInfo(TensorShape(11U, 13U), 1, DataType::F32, 0),
+                                                             TensorInfo(TensorShape(11U, 13U), 1, DataType::F32, 0),
+                                                             TensorInfo(TensorShape(11U, 13U), 1, DataType::F32, 0),
+                                                             TensorInfo(TensorShape(11U, 13U), 1, DataType::F32, 0),
+                                                             TensorInfo(TensorShape(11U, 13U), 1, DataType::F32, 0),
+                                                             TensorInfo(TensorShape(11U, 13U, 2U), 1, DataType::F32, 0),
+               })),
+               framework::dataset::make("ActivationInfo", { ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
+                                                            ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
+                                                            ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
+                                                            ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
+                                                            ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
+                                                            ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
+                                                            ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
+               })),
+               framework::dataset::make("Expected", { false, false, false, false, false, false, false })),
+               input_info, weights_info, recurrent_weights_info, bias_info, output_info, hidden_output_info, info, expected)
+{
+    ARM_COMPUTE_EXPECT(bool(CLRNNLayer::validate(&input_info.clone()->set_is_resizable(false), &weights_info.clone()->set_is_resizable(false), &recurrent_weights_info.clone()->set_is_resizable(false), &bias_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), &hidden_output_info.clone()->set_is_resizable(false), info)) == expected, framework::LogLevel::ERRORS);
+}
+// clang-format on
+// *INDENT-ON*
+
+template <typename T>
+using CLRNNLayerFixture = RNNLayerValidationFixture<CLTensor, CLAccessor, CLRNNLayer, T>;
+
+TEST_SUITE(FP32)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLRNNLayerFixture<float>, framework::DatasetMode::ALL, combine(datasets::SmallRNNLayerDataset(), framework::dataset::make("DataType", DataType::F32)))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+TEST_SUITE_END() // FP32
+
+TEST_SUITE(FP16)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLRNNLayerFixture<half>, framework::DatasetMode::ALL, combine(datasets::SmallRNNLayerDataset(), framework::dataset::make("DataType", DataType::F16)))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_f16);
+}
+TEST_SUITE_END() // FP16
+TEST_SUITE_END() // RNNLayer
+TEST_SUITE_END() // CL
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/CL/ReductionOperation.cpp b/tests/validation/CL/ReductionOperation.cpp
index 684ed46..a48e2f9 100644
--- a/tests/validation/CL/ReductionOperation.cpp
+++ b/tests/validation/CL/ReductionOperation.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -50,6 +50,36 @@
 TEST_SUITE(CL)
 TEST_SUITE(ReductionOperation)
 
+// *INDENT-OFF*
+// clang-format off
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
+    framework::dataset::make("InputInfo",          { TensorInfo(TensorShape(128U, 64U), 1, DataType::F32), // Mismatching data type input/output
+                                                     TensorInfo(TensorShape(128U, 64U), 2, DataType::F32), // Number of Input channels != 1
+                                                     TensorInfo(TensorShape(128U, 64U), 1, DataType::S16), // DataType != F32
+                                                     TensorInfo(TensorShape(128U, 64U), 1, DataType::F32), // Axis >= num_max_dimensions
+                                                     TensorInfo(TensorShape(128U, 64U), 1, DataType::F32), // Axis > 0
+                                                     TensorInfo(TensorShape(128U, 64U), 1, DataType::F32)
+                                                   }),
+    framework::dataset::make("OutputInfo",         { TensorInfo(TensorShape(1U, 64U), 1, DataType::F16),
+                                                     TensorInfo(TensorShape(1U, 64U), 1, DataType::F32),
+                                                     TensorInfo(TensorShape(1U, 64U), 1, DataType::S16),
+                                                     TensorInfo(TensorShape(1U, 64U), 1, DataType::F32),
+                                                     TensorInfo(TensorShape(1U, 64U), 1, DataType::F32),
+                                                     TensorInfo(TensorShape(1U, 64U), 1, DataType::F32)
+                                                   })),
+    framework::dataset::make("Axis",               { 0U, 0U, 0U, static_cast<unsigned int>(TensorShape::num_max_dimensions), 1U, 0U })),
+    framework::dataset::make("Expected",           { false, false, false, false, false, true })),
+    input_info, output_info, axis, expected)
+{
+    bool is_valid = bool(CLReductionOperation::validate(&input_info.clone()->set_is_resizable(false),
+                                                        &output_info.clone()->set_is_resizable(true),
+                                                        axis,
+                                                        ReductionOperation::SUM_SQUARE));
+    ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS);
+}
+// clang-format on
+// *INDENT-ON*
+
 template <typename T>
 using CLReductionOperationFixture = ReductionOperationValidationFixture<CLTensor, CLAccessor, CLReductionOperation, T>;
 
diff --git a/tests/validation/CL/Scale.cpp b/tests/validation/CL/Scale.cpp
index aeda33b..3d8750a 100644
--- a/tests/validation/CL/Scale.cpp
+++ b/tests/validation/CL/Scale.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -58,8 +58,10 @@
 /** Tolerance */
 constexpr AbsoluteTolerance<uint8_t> tolerance_u8(1);
 constexpr AbsoluteTolerance<int16_t> tolerance_s16(1);
-RelativeTolerance<float>             tolerance_f32(0.05);
-RelativeTolerance<half>              tolerance_f16(half(0.1));
+constexpr float                      tolerance_f32_absolute(0.001f);
+
+RelativeTolerance<float> tolerance_f32(0.05);
+RelativeTolerance<half>  tolerance_f16(half(0.1));
 
 constexpr float tolerance_num_f32(0.01f);
 } // namespace
@@ -98,7 +100,7 @@
     const BorderSize border_size(border_mode == BorderMode::UNDEFINED ? 0 : 1);
 
     // Validate valid region
-    const ValidRegion dst_valid_region = calculate_valid_region_scale(*(src.info()), shape_scaled, policy, border_size, (border_mode == BorderMode::UNDEFINED));
+    const ValidRegion dst_valid_region = calculate_valid_region_scale(*(src.info()), shape_scaled, policy, sampling_policy, (border_mode == BorderMode::UNDEFINED));
     validate(dst.info()->valid_region(), dst_valid_region);
 
     // Validate padding
@@ -116,53 +118,60 @@
 
 TEST_SUITE(Float)
 TEST_SUITE(FP32)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLScaleFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F32)),
+FIXTURE_DATA_TEST_CASE(RunSmall, CLScaleFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType",
+                                                                                                                     DataType::F32)),
+                                                                                                                     framework::dataset::make("DataLayout", { DataLayout::NCHW })),
                                                                                                              framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
                                                                                                      datasets::BorderModes()),
                                                                                              datasets::SamplingPolicies()))
 {
     //Create valid region
     TensorInfo        src_info(_shape, 1, _data_type);
-    const ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, BorderSize(1), (_border_mode == BorderMode::UNDEFINED));
+    const ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
 
     // Validate output
-    validate(CLAccessor(_target), _reference, valid_region, tolerance_f32, tolerance_num_f32);
+    validate(CLAccessor(_target), _reference, valid_region, tolerance_f32, tolerance_num_f32, tolerance_f32_absolute);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, CLScaleFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::F32)),
+FIXTURE_DATA_TEST_CASE(RunLarge, CLScaleFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType",
+                                                                                                                 DataType::F32)),
+                                                                                                                 framework::dataset::make("DataLayout", { DataLayout::NCHW })),
                                                                                                                  framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
                                                                                                          datasets::BorderModes()),
                                                                                                  datasets::SamplingPolicies()))
 {
     //Create valid region
     TensorInfo        src_info(_shape, 1, _data_type);
-    const ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, BorderSize(1), (_border_mode == BorderMode::UNDEFINED));
+    const ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
 
     // Validate output
-    validate(CLAccessor(_target), _reference, valid_region, tolerance_f32, tolerance_num_f32);
+    validate(CLAccessor(_target), _reference, valid_region, tolerance_f32, tolerance_num_f32, tolerance_f32_absolute);
 }
 TEST_SUITE_END()
 TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLScaleFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F16)),
+FIXTURE_DATA_TEST_CASE(RunSmall, CLScaleFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType",
+                                                                                                                    DataType::F16)),
+                                                                                                                    framework::dataset::make("DataLayout", { DataLayout::NCHW })),
                                                                                                             framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
                                                                                                     datasets::BorderModes()),
                                                                                             datasets::SamplingPolicies()))
 {
     //Create valid region
     TensorInfo        src_info(_shape, 1, _data_type);
-    const ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, BorderSize(1), (_border_mode == BorderMode::UNDEFINED));
+    const ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
 
     // Validate output
     validate(CLAccessor(_target), _reference, valid_region, tolerance_f16);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, CLScaleFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType",
+FIXTURE_DATA_TEST_CASE(RunLarge, CLScaleFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType",
                                                                                                                         DataType::F16)),
+                                                                                                                        framework::dataset::make("DataLayout", { DataLayout::NCHW })),
                                                                                                                 framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
                                                                                                         datasets::BorderModes()),
                                                                                                 datasets::SamplingPolicies()))
 {
     //Create valid region
     TensorInfo        src_info(_shape, 1, _data_type);
-    const ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, BorderSize(1), (_border_mode == BorderMode::UNDEFINED));
+    const ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
 
     // Validate output
     validate(CLAccessor(_target), _reference, valid_region, tolerance_f16);
@@ -172,53 +181,60 @@
 
 TEST_SUITE(Integer)
 TEST_SUITE(U8)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLScaleFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::U8)),
+FIXTURE_DATA_TEST_CASE(RunSmall, CLScaleFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType",
+                                                                                                                       DataType::U8)),
+                                                                                                                       framework::dataset::make("DataLayout", { DataLayout::NCHW })),
                                                                                                                framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
                                                                                                        datasets::BorderModes()),
                                                                                                datasets::SamplingPolicies()))
 {
     //Create valid region
     TensorInfo        src_info(_shape, 1, _data_type);
-    const ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, BorderSize(1), (_border_mode == BorderMode::UNDEFINED));
+    const ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
 
     // Validate output
     validate(CLAccessor(_target), _reference, valid_region, tolerance_u8);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, CLScaleFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::U8)),
+FIXTURE_DATA_TEST_CASE(RunLarge, CLScaleFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType",
+                                                                                                                   DataType::U8)),
+                                                                                                                   framework::dataset::make("DataLayout", { DataLayout::NCHW })),
                                                                                                                    framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
                                                                                                            datasets::BorderModes()),
                                                                                                    datasets::SamplingPolicies()))
 {
     //Create valid region
     TensorInfo        src_info(_shape, 1, _data_type);
-    const ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, BorderSize(1), (_border_mode == BorderMode::UNDEFINED));
+    const ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
 
     // Validate output
     validate(CLAccessor(_target), _reference, valid_region, tolerance_u8);
 }
 TEST_SUITE_END()
 TEST_SUITE(S16)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLScaleFixture<int16_t>, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::S16)),
+FIXTURE_DATA_TEST_CASE(RunSmall, CLScaleFixture<int16_t>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType",
+                                                                                                                       DataType::S16)),
+                                                                                                                       framework::dataset::make("DataLayout", { DataLayout::NCHW })),
                                                                                                                framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
                                                                                                        datasets::BorderModes()),
                                                                                                datasets::SamplingPolicies()))
 {
     //Create valid region
     TensorInfo        src_info(_shape, 1, _data_type);
-    const ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, BorderSize(1), (_border_mode == BorderMode::UNDEFINED));
+    const ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
 
     // Validate output
     validate(CLAccessor(_target), _reference, valid_region, tolerance_s16);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, CLScaleFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType",
+FIXTURE_DATA_TEST_CASE(RunLarge, CLScaleFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType",
                                                                                                                    DataType::S16)),
+                                                                                                                   framework::dataset::make("DataLayout", { DataLayout::NCHW })),
                                                                                                                    framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
                                                                                                            datasets::BorderModes()),
                                                                                                    datasets::SamplingPolicies()))
 {
     //Create valid region
     TensorInfo        src_info(_shape, 1, _data_type);
-    const ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, BorderSize(1), (_border_mode == BorderMode::UNDEFINED));
+    const ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
 
     // Validate output
     validate(CLAccessor(_target), _reference, valid_region, tolerance_s16);
diff --git a/tests/validation/CL/UNIT/MemoryManager.cpp b/tests/validation/CL/UNIT/MemoryManager.cpp
new file mode 100644
index 0000000..2129c03
--- /dev/null
+++ b/tests/validation/CL/UNIT/MemoryManager.cpp
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/CLBufferAllocator.h"
+#include "arm_compute/runtime/CL/CLMemoryGroup.h"
+#include "arm_compute/runtime/CL/CLTensorAllocator.h"
+#include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h"
+#include "support/ToolchainSupport.h"
+#include "tests/AssetsLibrary.h"
+#include "tests/CL/CLAccessor.h"
+#include "tests/Globals.h"
+#include "tests/Utils.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Macros.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/fixtures/UNIT/MemoryManagerFixture.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace
+{
+RelativeTolerance<float> tolerance_f32(0.05f);
+} // namespace
+
+TEST_SUITE(CL)
+TEST_SUITE(UNIT)
+TEST_SUITE(MemoryManager)
+
+using CLBlobMemoryManagerSimpleWithinFunctionLevelFixture = BlobMemoryManagerSimpleTestCaseFixture<CLTensor,
+      CLAccessor,
+      CLBufferAllocator,
+      CLFullyConnectedLayer>;
+FIXTURE_TEST_CASE(BlobMemoryManagerSimpleWithinFunctionLevel,
+                  CLBlobMemoryManagerSimpleWithinFunctionLevelFixture,
+                  framework::DatasetMode::ALL)
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+
+TEST_SUITE_END()
+TEST_SUITE_END()
+TEST_SUITE_END()
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/CL/UNIT/TensorAllocator.cpp b/tests/validation/CL/UNIT/TensorAllocator.cpp
new file mode 100644
index 0000000..a34a37e
--- /dev/null
+++ b/tests/validation/CL/UNIT/TensorAllocator.cpp
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/CLTensorAllocator.h"
+
+#include "arm_compute/runtime/CL/CLMemoryGroup.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Macros.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+TEST_SUITE(CL)
+TEST_SUITE(UNIT)
+TEST_SUITE(TensorAllocator)
+
+TEST_CASE(ImportMemory, framework::DatasetMode::ALL)
+{
+    // Init tensor info
+    TensorInfo info(TensorShape(24U, 16U, 3U), 1, DataType::F32);
+
+    // Allocate memory
+    auto buf = std::make_shared<CLBufferMemoryRegion>(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, info.total_size());
+
+    // Negative case : Import empty memory
+    CLTensor t1;
+    t1.allocator()->init(info);
+    ARM_COMPUTE_EXPECT(!bool(t1.allocator()->import_memory(CLMemory())), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(t1.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+    // Negative case : Import memory to a tensor that is memory managed
+    CLTensor      t2;
+    CLMemoryGroup mg;
+    t2.allocator()->set_associated_memory_group(&mg);
+    ARM_COMPUTE_EXPECT(!bool(t2.allocator()->import_memory(CLMemory(buf))), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(t2.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+    // Positive case : Set managed pointer
+    CLTensor t3;
+    t3.allocator()->init(info);
+    ARM_COMPUTE_EXPECT(bool(t3.allocator()->import_memory(CLMemory(buf))), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(!t3.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(t3.cl_buffer().get() == buf->cl_data().get(), framework::LogLevel::ERRORS);
+    t3.allocator()->free();
+    ARM_COMPUTE_EXPECT(t3.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(t3.buffer() == nullptr, framework::LogLevel::ERRORS);
+}
+
+TEST_SUITE_END()
+TEST_SUITE_END()
+TEST_SUITE_END()
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/CL/UNIT/Tuner.cpp b/tests/validation/CL/UNIT/Tuner.cpp
new file mode 100644
index 0000000..26d21b5
--- /dev/null
+++ b/tests/validation/CL/UNIT/Tuner.cpp
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLDirectConvolutionLayerKernel.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/tuners/BifrostTuner.h"
+#include "support/ToolchainSupport.h"
+#include "tests/Utils.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Macros.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+TEST_SUITE(CL)
+TEST_SUITE(UNIT)
+TEST_SUITE(Tuner)
+
+/** Validates static tuning of Bifrost tuner */
+TEST_CASE(BifrostTunerSimple, framework::DatasetMode::ALL)
+{
+    // Create tuner
+    tuners::BifrostTuner tuner;
+
+    // Create tensors
+    auto src     = create_tensor<CLTensor>(TensorShape(13U, 13U, 16U), DataType::F32);
+    auto weights = create_tensor<CLTensor>(TensorShape(3U, 3U, 16U, 3U), DataType::F32);
+    auto bias    = create_tensor<CLTensor>(TensorShape(3U), DataType::F32);
+    auto dst     = create_tensor<CLTensor>(TensorShape(13U, 13U, 3U), DataType::F32);
+
+    // Create kernel
+    cl::NDRange                    fake_lws(2000);
+    CLDirectConvolutionLayerKernel conv;
+    conv.set_target(GPUTarget::G72);
+
+    // Hard-wire lws to kernel and validate lws
+    conv.set_lws_hint(fake_lws);
+    ARM_COMPUTE_EXPECT(conv.lws_hint()[0] == 2000, framework::LogLevel::ERRORS);
+
+    // Configure
+    conv.configure(&src, &weights, &bias, &dst, PadStrideInfo(1, 1, 1, 1));
+    ARM_COMPUTE_EXPECT(conv.lws_hint()[0] == 2000, framework::LogLevel::ERRORS);
+
+    // Tune kernel and validate
+    tuner.tune_kernel_static(conv);
+    ARM_COMPUTE_EXPECT(conv.lws_hint()[0] != 2000, framework::LogLevel::ERRORS);
+
+    // Clear tuner
+    CLScheduler::get().default_init();
+}
+TEST_SUITE_END()
+TEST_SUITE_END()
+TEST_SUITE_END()
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/CL/WidthConcatenateLayer.cpp b/tests/validation/CL/WidthConcatenateLayer.cpp
new file mode 100644
index 0000000..0ff95df
--- /dev/null
+++ b/tests/validation/CL/WidthConcatenateLayer.cpp
@@ -0,0 +1,176 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/CLTensorAllocator.h"
+#include "arm_compute/runtime/CL/functions/CLWidthConcatenateLayer.h"
+#include "tests/CL/CLAccessor.h"
+#include "tests/datasets/ShapeDatasets.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/fixtures/WidthConcatenateLayerFixture.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+TEST_SUITE(CL)
+TEST_SUITE(WidthConcatenateLayer)
+// *INDENT-OFF*
+// clang-format off
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
+              framework::dataset::make("InputInfo1", {  TensorInfo(TensorShape(23U, 27U, 5U), 1, DataType::F32, 0), // Mismatching data type input/output
+                                                        TensorInfo(TensorShape(23U, 27U, 5U), 1, DataType::F32, 0), // Mismatching y dimension
+                                                        TensorInfo(TensorShape(23U, 27U, 5U), 1, DataType::F32, 0), // Mismatching total width
+                                                        TensorInfo(TensorShape(16U, 27U, 5U), 1, DataType::F32, 0)
+              }),
+              framework::dataset::make("InputInfo2", {  TensorInfo(TensorShape(24U, 27U, 4U), 1, DataType::F32, 0),
+                                                        TensorInfo(TensorShape(52U, 27U, 5U), 1, DataType::F32, 0),
+                                                        TensorInfo(TensorShape(52U, 27U, 5U), 1, DataType::F32, 0),
+                                                        TensorInfo(TensorShape(16U, 27U, 5U), 1, DataType::F32, 0)
+              })),
+              framework::dataset::make("OutputInfo", {  TensorInfo(TensorShape(47U, 27U, 5U), 1, DataType::F16, 0),
+                                                        TensorInfo(TensorShape(75U, 12U, 5U), 1, DataType::F32, 0),
+                                                        TensorInfo(TensorShape(11U, 27U, 5U), 1, DataType::F32, 0),
+                                                        TensorInfo(TensorShape(32U, 27U, 5U), 1, DataType::F32, 0)
+              })),
+              framework::dataset::make("Expected", { false, false, false, true })),
+              input_info1, input_info2, output_info,expected)
+{
+    std::vector<TensorInfo> inputs_vector_info;
+    inputs_vector_info.emplace_back(std::move(input_info1));
+    inputs_vector_info.emplace_back(std::move(input_info2));
+
+    std::vector<ITensorInfo *> inputs_vector_info_raw;
+    for(auto &input : inputs_vector_info)
+    {
+        inputs_vector_info_raw.emplace_back(&input);
+    }
+
+    bool is_valid = bool(CLWidthConcatenateLayer::validate(inputs_vector_info_raw,
+                                                           &output_info.clone()->set_is_resizable(false)));
+    ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS);
+}
+// clang-format on
+// *INDENT-ON*
+
+TEST_CASE(Configuration, framework::DatasetMode::ALL)
+{
+    // Create tensors
+    CLTensor src1 = create_tensor<CLTensor>(TensorShape(128U, 32U, 32U), DataType::F32, 1);
+    CLTensor src2 = create_tensor<CLTensor>(TensorShape(32U, 32U, 32U), DataType::F32, 1);
+    CLTensor src3 = create_tensor<CLTensor>(TensorShape(15U, 32U, 32U), DataType::F32, 1);
+    CLTensor dst;
+
+    ARM_COMPUTE_EXPECT(src1.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(src2.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(src3.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+    // Create and configure function
+    CLWidthConcatenateLayer concat_layer;
+
+    concat_layer.configure({ &src1, &src2, &src3 }, &dst);
+}
+
+template <typename T>
+using CLWidthConcatenateLayerFixture = WidthConcatenateLayerValidationFixture<CLTensor, ICLTensor, CLAccessor, CLWidthConcatenateLayer, T>;
+
+TEST_SUITE(Float)
+TEST_SUITE(FP16)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLWidthConcatenateLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(datasets::Small2DShapes(), framework::dataset::make("DataType",
+                                                                                                                  DataType::F16)))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge, CLWidthConcatenateLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(datasets::Large2DShapes(), framework::dataset::make("DataType",
+                                                                                                                DataType::F16)))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference);
+}
+TEST_SUITE_END()
+
+TEST_SUITE(FP32)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLWidthConcatenateLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(datasets::Small2DShapes(), framework::dataset::make("DataType",
+                                                                                                                   DataType::F32)))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge, CLWidthConcatenateLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(datasets::WidthConcatenateLayerShapes(), framework::dataset::make("DataType",
+                                                                                                                 DataType::F32)))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference);
+}
+TEST_SUITE_END()
+TEST_SUITE_END()
+
+TEST_SUITE(Quantized)
+TEST_SUITE(QS8)
+FIXTURE_DATA_TEST_CASE(RunTiny, CLWidthConcatenateLayerFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(datasets::Tiny2DShapes(),
+                                                                                                                   framework::dataset::make("DataType",
+                                                                                                                           DataType::QS8)))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference);
+}
+FIXTURE_DATA_TEST_CASE(RunSmall, CLWidthConcatenateLayerFixture<int8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::WidthConcatenateLayerShapes(),
+                                                                                                                  framework::dataset::make("DataType",
+                                                                                                                          DataType::QS8)))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference);
+}
+TEST_SUITE_END()
+
+TEST_SUITE(QS16)
+FIXTURE_DATA_TEST_CASE(RunTiny, CLWidthConcatenateLayerFixture<int16_t>, framework::DatasetMode::PRECOMMIT, combine(datasets::Tiny2DShapes(),
+                                                                                                                    framework::dataset::make("DataType",
+                                                                                                                            DataType::QS16)))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference);
+}
+FIXTURE_DATA_TEST_CASE(RunSmall, CLWidthConcatenateLayerFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(datasets::WidthConcatenateLayerShapes(),
+                                                                                                                   framework::dataset::make("DataType",
+                                                                                                                           DataType::QS16)))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference);
+}
+TEST_SUITE_END()
+TEST_SUITE_END()
+
+TEST_SUITE_END()
+TEST_SUITE_END()
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/CL/Winograd.cpp b/tests/validation/CL/Winograd.cpp
new file mode 100644
index 0000000..a61dd3f
--- /dev/null
+++ b/tests/validation/CL/Winograd.cpp
@@ -0,0 +1,414 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLWinogradFilterTransformKernel.h"
+#include "arm_compute/core/CL/kernels/CLWinogradOutputTransformKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/CLTensorAllocator.h"
+#include "arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h"
+#include "arm_compute/runtime/CL/functions/CLWinogradInputTransform.h"
+#include "tests/CL/CLAccessor.h"
+#include "tests/CL/Helper.h"
+#include "tests/PaddingCalculator.h"
+#include "tests/datasets/LargeConvolutionLayerDataset.h"
+#include "tests/datasets/ShapeDatasets.h"
+#include "tests/datasets/SmallConvolutionLayerDataset.h"
+#include "tests/datasets/WinogradInputTransformDataset.h"
+#include "tests/datasets/WinogradOutputTransformDataset.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/fixtures/WinogradConvolutionLayerFixture.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace
+{
+constexpr AbsoluteTolerance<float> tolerance_f32(0.001f);
+constexpr AbsoluteTolerance<float> tolerance_convolution_layer_f32(0.1f);
+} // namespace
+
+using namespace arm_compute::misc::shape_calculator;
+
+TEST_SUITE(CL)
+TEST_SUITE(Winograd)
+
+TEST_SUITE(InputTransform)
+
+// *INDENT-OFF*
+// clang-format off
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
+                                                framework::dataset::make("InputInfo",{
+                                                                                        TensorInfo(TensorShape(53U, 21U, 5U, 3U), 1, DataType::F16),     // F16 not supported
+                                                                                        TensorInfo(TensorShape(53U, 21U, 5U, 3U), 1, DataType::QASYMM8), // QASYMM8 not supported
+                                                                                        TensorInfo(TensorShape(53U, 21U, 5U, 3U), 1, DataType::F32),     // Kernel size not supported
+                                                                                        TensorInfo(TensorShape(53U, 21U, 5U, 3U), 1, DataType::F32),     // Strides not supported
+                                                                                        TensorInfo(TensorShape(53U, 33U, 4U), 1, DataType::F32),         // Padding needed
+                                                                                        TensorInfo(TensorShape(34U, 42U, 7U, 3U), 1, DataType::F32),     // Padding needed
+                                                                                        TensorInfo(TensorShape(31U, 37U, 37U), 1, DataType::F32)         // Padding needed
+                                                                                    }),
+                                                framework::dataset::make("OutputInfo", {
+                                                                                        TensorInfo(TensorShape(5U, 5U, 16U, 3U), 1, DataType::F16),
+                                                                                        TensorInfo(TensorShape(5U, 5U, 16U, 3U), 1, DataType::QASYMM8),
+                                                                                        TensorInfo(TensorShape(5U, 5U, 16U, 3U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(5U, 1U, 16U, 3U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(4U, 442U, 16U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(7U, 320U, 16U, 3U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(37U, 304U, 16U), 1, DataType::F32)
+                                                                                    })),
+                                                framework::dataset::make("WinogradInfo", {
+                                                                                        WinogradInfo(Size2D(2, 2), Size2D(3, 3), Size2D(53U, 21U), PadStrideInfo(1, 1, 1, 0), DataLayout::NCHW),
+                                                                                        WinogradInfo(Size2D(2, 2), Size2D(3, 3), Size2D(53U, 21U), PadStrideInfo(1, 1, 0, 0), DataLayout::NCHW),
+                                                                                        WinogradInfo(Size2D(2, 2), Size2D(3, 3), Size2D(53U, 21U), PadStrideInfo(1, 1, 1, 1), DataLayout::NCHW),
+                                                                                        WinogradInfo(Size2D(2, 2), Size2D(3, 3), Size2D(53U, 21U), PadStrideInfo(2, 1, 1, 1), DataLayout::NCHW),
+                                                                                        WinogradInfo(Size2D(2, 2), Size2D(3, 3), Size2D(53U, 33U), PadStrideInfo(1, 1, 0, 1), DataLayout::NCHW),
+                                                                                        WinogradInfo(Size2D(2, 2), Size2D(3, 3), Size2D(34U, 42U), PadStrideInfo(1, 1, 0, 0), DataLayout::NCHW),
+                                                                                        WinogradInfo(Size2D(2, 2), Size2D(3, 3), Size2D(31U, 37U), PadStrideInfo(1, 1, 1, 1), DataLayout::NCHW)
+                                                                                    })),
+                                                framework::dataset::make("Expected", { false, false, false, false, false, false, false })),
+                                            input_info, output_info, winograd_info, expected)
+{
+    ARM_COMPUTE_EXPECT(bool(CLWinogradInputTransform::validate(&input_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), winograd_info)) == expected, framework::LogLevel::ERRORS);
+}
+// clang-format on
+// *INDENT-ON*
+
+using CLWinogradInputTransformFixture = WinogradInputTransformValidationFixture<CLTensor, CLAccessor, CLWinogradInputTransform, float>;
+
+DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(framework::dataset::concat(datasets::SmallWinogradInputTransformDataset(), datasets::LargeWinogradInputTransformDataset()),
+                                                                           framework::dataset::make("DataLayout", { DataLayout::NCHW })),
+                                                                   framework::dataset::make("DataType", { DataType::F32 })),
+               shape_in, winograd_info, data_layout, data_type)
+{
+    TensorShape shape_out = compute_winograd_input_transform_shape(TensorInfo(shape_in, 1, data_type), winograd_info);
+
+    // Create tensors
+    CLTensor in  = create_tensor<CLTensor>(shape_in, data_type, 1, 0, QuantizationInfo(), data_layout);
+    CLTensor out = create_tensor<CLTensor>(shape_out, data_type);
+
+    ARM_COMPUTE_EXPECT(in.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(out.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+    // Create and configure function
+    CLWinogradInputTransform winograd_input_transform;
+
+    // Configure the function
+    winograd_input_transform.configure(&in, &out, winograd_info);
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradInputTransformFixture, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallWinogradInputTransformDataset(),
+                                                                                                                     framework::dataset::make("DataLayout", { DataLayout::NCHW })),
+                                                                                                             framework::dataset::make("DataType", { DataType::F32 })))
+{
+    validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradInputTransformFixture, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeWinogradInputTransformDataset(),
+                                                                                                                   framework::dataset::make("DataLayout", { DataLayout::NCHW })),
+                                                                                                           framework::dataset::make("DataType", { DataType::F32 })))
+{
+    validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+TEST_SUITE_END() // InputTransform
+
+TEST_SUITE(FilterTransform)
+// *INDENT-OFF*
+// clang-format off
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
+                                                framework::dataset::make("InputInfo",{
+                                                                                        TensorInfo(TensorShape(3U, 3U, 5U, 3U), 1, DataType::F16),     // F16 not supported
+                                                                                        TensorInfo(TensorShape(3U, 3U, 5U, 3U), 1, DataType::QASYMM8), // QASYMM8 not supported
+                                                                                        TensorInfo(TensorShape(5U, 5U, 5U, 3U), 1, DataType::F32),     // Kernel size not supported
+                                                                                        TensorInfo(TensorShape(3U, 3U), 1, DataType::F32),             // Output tile not supported
+                                                                                        TensorInfo(TensorShape(3U, 3U, 5U, 3U), 1, DataType::F32),     // valid
+                                                                                        TensorInfo(TensorShape(3U, 3U, 37U, 2U), 1, DataType::F32),    // valid
+                                                                                        TensorInfo(TensorShape(3U, 3U, 37U, 22U), 1, DataType::F32)    // valid
+                                                                                    }),
+                                                framework::dataset::make("OutputInfo", {
+                                                                                        TensorInfo(TensorShape(3U, 5U, 16U), 1, DataType::F16),
+                                                                                        TensorInfo(TensorShape(3U, 5U, 16U), 1, DataType::QASYMM8),
+                                                                                        TensorInfo(TensorShape(3U, 5U, 16U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(1U, 1U, 16U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(3U, 5U, 16U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(2U, 37U, 16U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(22U, 37U, 36U), 1, DataType::F32)
+                                                                                    })),
+                                                framework::dataset::make("WinogradInfo", {
+                                                                                          WinogradInfo(Size2D(2U, 2U), Size2D(3U, 3U), Size2D() /* Not needed */, PadStrideInfo() /* Not needed */, DataLayout::NCHW  /* Not needed */ ),
+                                                                                          WinogradInfo(Size2D(2U, 2U), Size2D(3U, 3U), Size2D() /* Not needed */, PadStrideInfo() /* Not needed */, DataLayout::NCHW  /* Not needed */ ),
+                                                                                          WinogradInfo(Size2D(2U, 2U), Size2D(3U, 3U), Size2D() /* Not needed */, PadStrideInfo() /* Not needed */, DataLayout::NCHW  /* Not needed */ ),
+                                                                                          WinogradInfo(Size2D(3U, 3U), Size2D(3U, 3U), Size2D() /* Not needed */, PadStrideInfo() /* Not needed */, DataLayout::NCHW  /* Not needed */ ),
+                                                                                          WinogradInfo(Size2D(2U, 2U), Size2D(3U, 3U), Size2D() /* Not needed */, PadStrideInfo() /* Not needed */, DataLayout::NCHW  /* Not needed */ ),
+                                                                                          WinogradInfo(Size2D(2U, 2U), Size2D(3U, 3U), Size2D() /* Not needed */, PadStrideInfo() /* Not needed */, DataLayout::NCHW  /* Not needed */ ),
+                                                                                          WinogradInfo(Size2D(4U, 4U), Size2D(3U, 3U), Size2D() /* Not needed */, PadStrideInfo() /* Not needed */, DataLayout::NCHW  /* Not needed */ )
+                                                                                         })),
+                                                framework::dataset::make("Expected", { false, false, false, false, true, true, true })),
+                                            input_info, output_info, winograd_info, expected)
+{
+    ARM_COMPUTE_EXPECT(bool(CLWinogradFilterTransformKernel::validate(&input_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), winograd_info)) == expected, framework::LogLevel::ERRORS);
+}
+// clang-format on
+// *INDENT-ON*
+
+using CLWinogradFilterTransform        = CLSynthetizeFunctionWithZeroConstantBorder<CLWinogradFilterTransformKernel, 0>;
+using CLWinogradFilterTransformFixture = WinogradFilterTransformValidationFixture<CLTensor, CLAccessor, CLWinogradFilterTransform, float>;
+
+DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(combine(framework::dataset::concat(datasets::Small3x3Shapes(), datasets::Large3x3Shapes()),
+                                                                                   framework::dataset::make("OutputTile", { Size2D(2U, 2U), Size2D(4U, 4U) })),
+                                                                           framework::dataset::make("DataLayout", { DataLayout::NCHW })),
+                                                                   framework::dataset::make("DataType", { DataType::F32 })),
+               shape_a, output_tile, data_layout, data_type)
+{
+    WinogradInfo winograd_info(output_tile, Size2D(shape_a[0], shape_a[1]), Size2D() /* Not needed */, PadStrideInfo() /* Not needed */, DataLayout::NCHW /* Not needed */);
+
+    TensorShape shape_b = compute_winograd_filter_transform_shape(TensorInfo(shape_a, 1, data_type), winograd_info);
+
+    // Create tensors
+    CLTensor a = create_tensor<CLTensor>(shape_a, data_type, 1, 0, QuantizationInfo(), data_layout);
+    CLTensor b = create_tensor<CLTensor>(shape_b, data_type, 1, 0, QuantizationInfo(), data_layout);
+
+    ARM_COMPUTE_EXPECT(a.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(b.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+    // Create and configure function
+    CLWinogradFilterTransform winograd_filter_transform;
+    winograd_filter_transform.configure(&a, &b, winograd_info);
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradFilterTransformFixture, framework::DatasetMode::ALL,
+                       combine(combine(framework::dataset::concat(framework::dataset::concat(combine(datasets::Small3x3Shapes(), framework::dataset::make("OutputTile", Size2D(2U, 2U))), combine(datasets::Small3x3Shapes(),
+                                                                                             framework::dataset::make("OutputTile", Size2D(4U, 4U)))),
+                                                                  combine(datasets::Small5x5Shapes(), framework::dataset::make("OutputTile", Size2D(4U, 4U)))),
+                                       framework::dataset::make("DataLayout", { DataLayout::NCHW })),
+                               framework::dataset::make("DataType", { DataType::F32 })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradFilterTransformFixture, framework::DatasetMode::NIGHTLY,
+                       combine(combine(framework::dataset::concat(framework::dataset::concat(combine(datasets::Large3x3Shapes(), framework::dataset::make("OutputTile", Size2D(2U, 2U))), combine(datasets::Large3x3Shapes(),
+                                                                                             framework::dataset::make("OutputTile", Size2D(4U, 4U)))),
+                                                                  combine(datasets::Large5x5Shapes(), framework::dataset::make("OutputTile", Size2D(4U, 4U)))),
+                                       framework::dataset::make("DataLayout", { DataLayout::NCHW })),
+                               framework::dataset::make("DataType", { DataType::F32 })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+
+TEST_SUITE_END() // FilterTransform
+
+TEST_SUITE(OutputTransform)
+// *INDENT-OFF*
+// clang-format off
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
+                                                framework::dataset::make("InputInfo",{
+                                                                                        TensorInfo(TensorShape(512U, 49U, 16U, 5U), 1, DataType::F16),      // F16 not supported
+                                                                                        TensorInfo(TensorShape(512U, 49U, 16U, 5U), 1, DataType::QASYMM8),  // QASYMM8 not supported
+                                                                                        TensorInfo(TensorShape(512U, 49U, 16U, 5U), 1, DataType::F32),      // Kernel size not supported
+                                                                                        TensorInfo(TensorShape(512U, 49U, 16U, 5U), 1, DataType::F32),      // Valid
+                                                                                        TensorInfo(TensorShape(13U, 108U, 16U, 4U), 1, DataType::F32),      // Padding needed
+                                                                                        TensorInfo(TensorShape(7U, 20U, 16U, 7U), 1, DataType::F32),        // Valid
+                                                                                        TensorInfo(TensorShape(7U, 20U, 16U, 7U), 1, DataType::F32),        // Wrong WinogradInfo
+                                                                                        TensorInfo(TensorShape(7U, 256U, 36U, 3U), 1, DataType::F32),       // Valid
+                                                                                        TensorInfo(TensorShape(7U, 256U, 16U, 3U), 1, DataType::F32)        // Wrong number of batches
+                                                                                    }),
+                                                framework::dataset::make("BiasInfo", {
+                                                                                        TensorInfo(TensorShape(512U), 1, DataType::F16),
+                                                                                        TensorInfo(TensorShape(512U), 1, DataType::QASYMM8),
+                                                                                        TensorInfo(TensorShape(512U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(512U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(13U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(7U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(7U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(7U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(7U), 1, DataType::F32)
+                                                                                    })),
+                                                framework::dataset::make("OutputInfo", {
+                                                                                        TensorInfo(TensorShape(14U, 14U, 512U, 5U), 1, DataType::F16),
+                                                                                        TensorInfo(TensorShape(14U, 14U, 512U, 5U), 1, DataType::QASYMM8),
+                                                                                        TensorInfo(TensorShape(14U, 14U, 512U, 5U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(14U, 14U, 512U, 5U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(17U, 23U, 13U, 4U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(8U, 10U, 7U, 7U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(7U, 9U, 7U, 7U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(64U, 64U, 7U, 3U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(64U, 64U, 7U, 3U), 1, DataType::F32)
+                                                                                    })),
+                                                framework::dataset::make("WinogradInfo", {
+                                                                                        WinogradInfo(Size2D(2U, 2U), Size2D(3U, 3U), Size2D(14U, 14U), PadStrideInfo(1, 1, 1, 1), DataLayout::NCHW),
+                                                                                        WinogradInfo(Size2D(2U, 2U), Size2D(3U, 3U), Size2D(14U, 14U), PadStrideInfo(1, 1, 1, 1), DataLayout::NCHW),
+                                                                                        WinogradInfo(Size2D(2U, 2U), Size2D(5U, 5U), Size2D(14U, 14U), PadStrideInfo(1, 1, 1, 1), DataLayout::NCHW),
+                                                                                        WinogradInfo(Size2D(2U, 2U), Size2D(3U, 3U), Size2D(14U, 14U), PadStrideInfo(1, 1, 1, 1), DataLayout::NCHW),
+                                                                                        WinogradInfo(Size2D(2U, 2U), Size2D(3U, 3U), Size2D(17U, 23U), PadStrideInfo(1, 1, 1, 1), DataLayout::NCHW),
+                                                                                        WinogradInfo(Size2D(2U, 2U), Size2D(3U, 3U), Size2D(8U, 10U), PadStrideInfo(1, 1, 1, 1), DataLayout::NCHW),
+                                                                                        WinogradInfo(Size2D(2U, 3U), Size2D(3U, 3U), Size2D(8U, 10U), PadStrideInfo(1, 1, 0, 0), DataLayout::NCHW),
+                                                                                        WinogradInfo(Size2D(4U, 4U), Size2D(3U, 3U), Size2D(64U, 64U), PadStrideInfo(1, 1, 1, 1), DataLayout::NCHW),
+                                                                                        WinogradInfo(Size2D(4U, 4U), Size2D(3U, 3U), Size2D(64U, 64U), PadStrideInfo(1, 1, 1, 1), DataLayout::NCHW)
+                                                                                    })),
+                                                framework::dataset::make("Expected", { false, false, false, true, false, true, false, true, false })),
+                                            input_info, bias_info, output_info, winograd_info, expected)
+{
+    ARM_COMPUTE_EXPECT(bool(CLWinogradOutputTransformKernel::validate(&input_info.clone()->set_is_resizable(false), &bias_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), winograd_info)) == expected, framework::LogLevel::ERRORS);
+}
+// clang-format on
+// *INDENT-ON*
+
+using CLWinogradOutputTransform        = CLSynthetizeFunctionWithZeroConstantBorder<CLWinogradOutputTransformKernel, 0>;
+using CLWinogradOutputTransformFixture = WinogradOutputTransformValidationFixture<CLTensor, CLAccessor, CLWinogradOutputTransform, float>;
+
+DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(framework::dataset::concat(datasets::SmallWinogradOutputTransformDataset(), datasets::LargeWinogradOutputTransformDataset()),
+                                                                   framework::dataset::make("DataType", { DataType::F32 })),
+               shape_a, winograd_info, data_type)
+{
+    TensorShape shape_b = compute_winograd_output_transform_shape(TensorInfo(shape_a, 1, data_type), winograd_info);
+
+    // Create tensors
+    CLTensor a = create_tensor<CLTensor>(shape_a, data_type);
+    CLTensor b = create_tensor<CLTensor>(shape_b, data_type);
+
+    ARM_COMPUTE_EXPECT(a.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(b.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+    // Create and configure function
+    CLWinogradOutputTransform winograd_output_transform;
+    winograd_output_transform.configure(&a, nullptr, &b, winograd_info);
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradOutputTransformFixture, framework::DatasetMode::ALL, combine(datasets::SmallWinogradOutputTransformDataset(), framework::dataset::make("DataType", { DataType::F32 })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradOutputTransformFixture, framework::DatasetMode::NIGHTLY, combine(datasets::LargeWinogradOutputTransformDataset(), framework::dataset::make("DataType", { DataType::F32 })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+
+TEST_SUITE_END() // OutputTransform
+
+TEST_SUITE(ConvolutionLayer)
+// *INDENT-OFF*
+// clang-format off
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(
+                                                framework::dataset::make("InputInfo", {
+                                                                                        TensorInfo(TensorShape(17U, 31U, 2U), 1, DataType::F16),     // FP16 not supported
+                                                                                        TensorInfo(TensorShape(17U, 31U, 2U), 1, DataType::F32),     // Datatype mismatch
+                                                                                        TensorInfo(TensorShape(23U, 27U, 5U, 4U), 1, DataType::F32), // Stride y not supported
+                                                                                        TensorInfo(TensorShape(16U, 16U, 8U), 1, DataType::F32),     // Padding needed
+                                                                                        TensorInfo(TensorShape(33U, 27U, 7U, 4U), 1, DataType::F32)  // Kernel size not supported
+                                                                                      }),
+                                                framework::dataset::make("WeightsInfo", {
+                                                                                        TensorInfo(TensorShape(3U, 3U, 2U, 19U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(3U, 3U, 2U, 19U), 1, DataType::QASYMM8),
+                                                                                        TensorInfo(TensorShape(3U, 3U, 5U, 21U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(3U, 3U, 8U, 16U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(5U, 5U, 7U, 16U), 1, DataType::F16)
+                                                                                        })),
+                                                framework::dataset::make("BiasesInfo", {
+                                                                                        TensorInfo(TensorShape(19U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(19U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(21U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(16U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(16U), 1, DataType::F32)
+                                                                                       })),
+                                                framework::dataset::make("OutputInfo", {
+                                                                                        TensorInfo(TensorShape(17U, 31U, 19U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(15U, 15U, 19U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(21U, 25U, 21U, 4U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(16U, 16U, 16U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(11U, 12U, 16U, 4U), 1, DataType::F32)
+                                                                                       })),
+                                                framework::dataset::make("ConvInfo", {
+                                                                                        PadStrideInfo(1, 1, 1, 1),
+                                                                                        PadStrideInfo(1, 1, 1, 1),
+                                                                                        PadStrideInfo(1, 2, 0, 0),
+                                                                                        PadStrideInfo(1, 1, 1, 1),
+                                                                                        PadStrideInfo(1, 1, 1, 0)
+                                                                                                                 })),
+                                                framework::dataset::make("Expected", { false, false, false, false, false })),
+               input_info, weights_info, bias_info, output_info, conv_info, expected)
+{
+    ARM_COMPUTE_EXPECT(bool(CLWinogradConvolutionLayer::validate(&input_info.clone()->set_is_resizable(false), &weights_info.clone()->set_is_resizable(false), &bias_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), conv_info)) == expected, framework::LogLevel::ERRORS);
+}
+// clang-format on
+// *INDENT-ON*
+
+using CLWinogradConvolutionLayerFastMathFixture = WinogradConvolutionLayerFastMathValidationFixture<CLTensor, CLAccessor, CLWinogradConvolutionLayer, float>;
+TEST_SUITE(Conv3x3)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradConvolutionLayerFastMathFixture, framework::DatasetMode::PRECOMMIT,
+                       combine(combine(datasets::SmallWinogradConvolutionLayer3x3Dataset(),
+                                       framework::dataset::make("DataType", { DataType::F32 })),
+                               framework::dataset::make("ActivationLayerInfo", { ActivationLayerInfo() })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_convolution_layer_f32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradConvolutionLayerFastMathFixture, framework::DatasetMode::NIGHTLY,
+                       combine(combine(datasets::LargeWinogradConvolutionLayer3x3Dataset(),
+                                       framework::dataset::make("DataType", { DataType::F32 })),
+                               framework::dataset::make("ActivationLayerInfo", { ActivationLayerInfo() })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_convolution_layer_f32);
+}
+TEST_SUITE_END() // Conv3x3
+
+TEST_SUITE(Conv5x5)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradConvolutionLayerFastMathFixture, framework::DatasetMode::PRECOMMIT,
+                       combine(combine(datasets::SmallWinogradConvolutionLayer5x5Dataset(),
+                                       framework::dataset::make("DataType", { DataType::F32 })),
+                               framework::dataset::make("ActivationLayerInfo", { ActivationLayerInfo() })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_convolution_layer_f32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradConvolutionLayerFastMathFixture, framework::DatasetMode::NIGHTLY,
+                       combine(combine(datasets::LargeWinogradConvolutionLayer5x5Dataset(),
+                                       framework::dataset::make("DataType", { DataType::F32 })),
+                               framework::dataset::make("ActivationLayerInfo", { ActivationLayerInfo() })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_convolution_layer_f32);
+}
+TEST_SUITE_END() // Conv5x5
+
+TEST_SUITE_END() // ConvolutionLayer
+
+TEST_SUITE_END() // Winograd
+TEST_SUITE_END() // CL
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/FixedPoint.h b/tests/validation/FixedPoint.h
index 6699aee..a573515 100644
--- a/tests/validation/FixedPoint.h
+++ b/tests/validation/FixedPoint.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -54,15 +54,26 @@
 // Promote types
 // *INDENT-OFF*
 // clang-format off
+/** Promote a type */
 template <typename T> struct promote { };
-template <> struct promote<uint8_t> { using type = uint16_t; };
-template <> struct promote<int8_t> { using type = int16_t; };
-template <> struct promote<uint16_t> { using type = uint32_t; };
-template <> struct promote<int16_t> { using type = int32_t; };
-template <> struct promote<uint32_t> { using type = uint64_t; };
-template <> struct promote<int32_t> { using type = int64_t; };
-template <> struct promote<uint64_t> { using type = uint64_t; };
-template <> struct promote<int64_t> { using type = int64_t; };
+/** Promote uint8_t to uint16_t */
+template <> struct promote<uint8_t> { using type = uint16_t; /**< Promoted type */ };
+/** Promote int8_t to int16_t */
+template <> struct promote<int8_t> { using type = int16_t; /**< Promoted type */ };
+/** Promote uint16_t to uint32_t */
+template <> struct promote<uint16_t> { using type = uint32_t; /**< Promoted type */ };
+/** Promote int16_t to int32_t */
+template <> struct promote<int16_t> { using type = int32_t; /**< Promoted type */ };
+/** Promote uint32_t to uint64_t */
+template <> struct promote<uint32_t> { using type = uint64_t; /**< Promoted type */ };
+/** Promote int32_t to int64_t */
+template <> struct promote<int32_t> { using type = int64_t; /**< Promoted type */ };
+/** Promote float to float */
+template <> struct promote<float> { using type = float; /**< Promoted type */ };
+/** Promote half to half */
+template <> struct promote<half> { using type = half; /**< Promoted type */ };
+
+/** Get promoted type */
 template <typename T>
 using promote_t = typename promote<T>::type;
 // clang-format on
@@ -281,6 +292,7 @@
     return __builtin_clz(value) - (32 - std::numeric_limits<unsigned_T>::digits);
 }
 
+/** Constant expressions */
 template <typename T>
 struct constant_expr
 {
@@ -362,6 +374,7 @@
         return static_cast<T>(std::min<U>(std::max<U>(val, static_cast<U>(std::numeric_limits<T>::min())), static_cast<U>(std::numeric_limits<T>::max())));
     }
 };
+/** Functions */
 struct functions
 {
     /** Output stream operator
diff --git a/tests/validation/GLES_COMPUTE/BatchNormalizationLayer.cpp b/tests/validation/GLES_COMPUTE/BatchNormalizationLayer.cpp
index d817fc0..d22f1e9 100644
--- a/tests/validation/GLES_COMPUTE/BatchNormalizationLayer.cpp
+++ b/tests/validation/GLES_COMPUTE/BatchNormalizationLayer.cpp
@@ -32,6 +32,7 @@
 #include "tests/framework/Asserts.h"
 #include "tests/framework/Macros.h"
 #include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/Helpers.h"
 #include "tests/validation/Validation.h"
 #include "tests/validation/fixtures/BatchNormalizationLayerFixture.h"
 
@@ -59,15 +60,25 @@
 template <typename T>
 using GCBatchNormalizationLayerFixture = BatchNormalizationLayerValidationFixture<GCTensor, GCAccessor, GCBatchNormalizationLayer, T>;
 
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::RandomBatchNormalizationLayerDataset(), framework::dataset::make("DataType", { DataType::F32 })),
-               shape0, shape1, epsilon, dt)
+DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(combine(datasets::RandomBatchNormalizationLayerDataset(),
+                                                                                   combine(framework::dataset::make("UseBeta", { false, true }),
+                                                                                           framework::dataset::make("UseGamma", { false, true }))),
+                                                                           framework::dataset::make("DataType", { DataType::F32 })),
+                                                                   framework::dataset::make("DataLayout", { DataLayout::NCHW })),
+               shape0, shape1, epsilon, use_beta, use_gamma, dt, data_layout)
 {
     // Set fixed point position data type allowed
     int fixed_point_position = (arm_compute::is_data_type_fixed_point(dt)) ? 3 : 0;
 
+    TensorShape src_dst_shapes = shape0;
+    if(data_layout == DataLayout::NHWC)
+    {
+        permute(src_dst_shapes, PermutationVector(2U, 0U, 1U));
+    }
+
     // Create tensors
-    GCTensor src   = create_tensor<GCTensor>(shape0, dt, 1, fixed_point_position);
-    GCTensor dst   = create_tensor<GCTensor>(shape0, dt, 1, fixed_point_position);
+    GCTensor src   = create_tensor<GCTensor>(src_dst_shapes, dt, 1, fixed_point_position, QuantizationInfo(), data_layout);
+    GCTensor dst   = create_tensor<GCTensor>(src_dst_shapes, dt, 1, fixed_point_position, QuantizationInfo(), data_layout);
     GCTensor mean  = create_tensor<GCTensor>(shape1, dt, 1, fixed_point_position);
     GCTensor var   = create_tensor<GCTensor>(shape1, dt, 1, fixed_point_position);
     GCTensor beta  = create_tensor<GCTensor>(shape1, dt, 1, fixed_point_position);
@@ -75,18 +86,23 @@
 
     // Create and Configure function
     GCBatchNormalizationLayer norm;
-    norm.configure(&src, &dst, &mean, &var, &beta, &gamma, epsilon);
+    GCTensor                 *beta_ptr  = use_beta ? &beta : nullptr;
+    GCTensor                 *gamma_ptr = use_gamma ? &gamma : nullptr;
+    norm.configure(&src, &dst, &mean, &var, beta_ptr, gamma_ptr, epsilon);
 
     // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape0);
+    const ValidRegion valid_region = shape_to_valid_region(src_dst_shapes);
     validate(dst.info()->valid_region(), valid_region);
 }
 
 TEST_SUITE(Float)
 TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(Random, GCBatchNormalizationLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::RandomBatchNormalizationLayerDataset(),
+FIXTURE_DATA_TEST_CASE(Random, GCBatchNormalizationLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::RandomBatchNormalizationLayerDataset(),
+                                                                                                                  combine(framework::dataset::make("UseBeta", { false, true }),
+                                                                                                                          framework::dataset::make("UseGamma", { false, true }))),
                                                                                                                   act_infos),
-                                                                                                                  framework::dataset::make("DataType", DataType::F16)))
+                                                                                                                  framework::dataset::make("DataType", DataType::F16)),
+                                                                                                                  framework::dataset::make("DataLayout", { DataLayout::NCHW })))
 {
     // Validate output
     validate(GCAccessor(_target), _reference, tolerance_f16, 0);
@@ -94,9 +110,12 @@
 TEST_SUITE_END()
 
 TEST_SUITE(FP32)
-FIXTURE_DATA_TEST_CASE(Random, GCBatchNormalizationLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::RandomBatchNormalizationLayerDataset(),
+FIXTURE_DATA_TEST_CASE(Random, GCBatchNormalizationLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::RandomBatchNormalizationLayerDataset(),
+                                                                                                                   combine(framework::dataset::make("UseBeta", { false, true }),
+                                                                                                                           framework::dataset::make("UseGamma", { false, true }))),
                                                                                                                    act_infos),
-                                                                                                                   framework::dataset::make("DataType", DataType::F32)))
+                                                                                                                   framework::dataset::make("DataType", DataType::F32)),
+                                                                                                                   framework::dataset::make("DataLayout", { DataLayout::NCHW })))
 {
     // Validate output
     validate(GCAccessor(_target), _reference, tolerance_f, 0);
diff --git a/tests/validation/GLES_COMPUTE/ConvolutionLayer.cpp b/tests/validation/GLES_COMPUTE/ConvolutionLayer.cpp
index ddb5976..722dd68 100644
--- a/tests/validation/GLES_COMPUTE/ConvolutionLayer.cpp
+++ b/tests/validation/GLES_COMPUTE/ConvolutionLayer.cpp
@@ -45,21 +45,30 @@
 namespace
 {
 RelativeTolerance<half_float::half> tolerance_f16(half_float::half(0.2)); /**< Tolerance value for comparing reference's output against implementation's output for DataType::F16 */
+RelativeTolerance<float>            tolerance_f32(0.00001f);              /**< Tolerance value for comparing reference's output against implementation's output for DataType::F32 */
 constexpr float                     tolerance_num = 0.07f;                /**< Tolerance number */
 
 /** CNN data types */
 const auto CNNDataTypes = framework::dataset::make("DataType",
 {
     DataType::F16,
-    // DataType::F32,
+    DataType::F32,
+});
+const auto ActivationFunctionsDataset = framework::dataset::make("ActivationInfo",
+{
+    ActivationLayerInfo(),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 0.5f)
 });
 } // namespace
 
 TEST_SUITE(GC)
 TEST_SUITE(ConvolutionLayer)
 
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(framework::dataset::concat(datasets::SmallConvolutionLayerDataset(), datasets::LargeConvolutionLayerDataset()), CNNDataTypes),
-               input_shape, weights_shape, bias_shape, output_shape, info, data_type)
+DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(framework::dataset::concat(datasets::SmallConvolutionLayerDataset(), datasets::LargeConvolutionLayerDataset()),
+                                                                           CNNDataTypes),
+                                                                   ActivationFunctionsDataset),
+               input_shape, weights_shape, bias_shape, output_shape, info, dilation, data_type, act_info)
 {
     // Set fixed point position data type allowed
     int fixed_point_position = is_data_type_fixed_point(data_type) ? 3 : 0;
@@ -82,7 +91,7 @@
 
     // Create and configure function
     GCConvolutionLayer conv;
-    conv.configure(&src, &weights, &bias, &dst, info);
+    conv.configure(&src, &weights, &bias, &dst, info, WeightsInfo(), dilation, act_info);
 
     // Validate valid region
     const ValidRegion src_valid_region     = shape_to_valid_region(input_shape);
@@ -105,23 +114,52 @@
 
 TEST_SUITE(Float)
 TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall, GCConvolutionLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallConvolutionLayerDataset(),
-                                                                                                                     framework::dataset::make("ReshapeWeights", { true, false })),
-                                                                                                             framework::dataset::make("DataType",
-                                                                                                                     DataType::F16)))
+FIXTURE_DATA_TEST_CASE(RunSmall, GCConvolutionLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallConvolutionLayerDataset(),
+                                                                                                                     framework::dataset::make("ReshapeWeights", { true })),
+                                                                                                                     framework::dataset::make("DataType",
+                                                                                                                             DataType::F16)),
+                                                                                                                     framework::dataset::make("DataLayout",
+                                                                                                                             DataLayout::NCHW)),
+                                                                                                             ActivationFunctionsDataset))
 {
     // Validate output
     validate(GCAccessor(_target), _reference, tolerance_f16, tolerance_num);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, GCConvolutionLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeConvolutionLayerDataset(),
-                                                                                                                   framework::dataset::make("ReshapeWeights", { true, false })),
-                                                                                                           framework::dataset::make("DataType",
-                                                                                                                   DataType::F16)))
+FIXTURE_DATA_TEST_CASE(RunLarge, GCConvolutionLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeConvolutionLayerDataset(),
+                                                                                                                   framework::dataset::make("ReshapeWeights", { true })),
+                                                                                                                   framework::dataset::make("DataType",
+                                                                                                                           DataType::F16)),
+                                                                                                                   framework::dataset::make("DataLayout",
+                                                                                                                           DataLayout::NCHW)),
+                                                                                                           ActivationFunctionsDataset))
 {
     // Validate output
     validate(GCAccessor(_target), _reference, tolerance_f16, tolerance_num);
 }
 TEST_SUITE_END()
+
+TEST_SUITE(FP32)
+FIXTURE_DATA_TEST_CASE(RunSmall, GCConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallConvolutionLayerDataset(),
+                                                                                                                      framework::dataset::make("ReshapeWeights", { true })),
+                                                                                                                      framework::dataset::make("DataType", DataType::F32)),
+                                                                                                                      framework::dataset::make("DataLayout",
+                                                                                                                              DataLayout::NCHW)),
+                                                                                                              ActivationFunctionsDataset))
+{
+    // Validate output
+    validate(GCAccessor(_target), _reference, tolerance_f32, tolerance_num);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge, GCConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeConvolutionLayerDataset(),
+                                                                                                                    framework::dataset::make("ReshapeWeights", { true })),
+                                                                                                                    framework::dataset::make("DataType", DataType::F32)),
+                                                                                                                    framework::dataset::make("DataLayout",
+                                                                                                                            DataLayout::NCHW)),
+                                                                                                            ActivationFunctionsDataset))
+{
+    // Validate output
+    validate(GCAccessor(_target), _reference, tolerance_f32, tolerance_num);
+}
+TEST_SUITE_END()
 TEST_SUITE_END()
 
 TEST_SUITE_END()
diff --git a/tests/validation/GLES_COMPUTE/DepthwiseConvolutionLayer.cpp b/tests/validation/GLES_COMPUTE/DepthwiseConvolutionLayer.cpp
index cacf696..22b1e08 100644
--- a/tests/validation/GLES_COMPUTE/DepthwiseConvolutionLayer.cpp
+++ b/tests/validation/GLES_COMPUTE/DepthwiseConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -44,6 +44,8 @@
 {
 RelativeTolerance<half> tolerance_fp16(half(0.2)); /**< Tolerance value for comparing reference's output against implementation's output for DataType::F16 */
 constexpr float         tolerance_num = 0.07f;     /**< Tolerance number */
+
+const auto depth_multipliers = framework::dataset::make("DepthMultiplier", { 1, 2, 3 });
 } // namespace
 
 TEST_SUITE(GC)
@@ -55,15 +57,19 @@
 TEST_SUITE(Float)
 TEST_SUITE(FP16)
 TEST_SUITE(W3x3)
-FIXTURE_DATA_TEST_CASE(RunSmall, GCDepthwiseConvolutionLayerFixture3x3<half>, framework::DatasetMode::ALL, combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(),
+FIXTURE_DATA_TEST_CASE(RunSmall, GCDepthwiseConvolutionLayerFixture3x3<half>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(),
+                                                                                                                   depth_multipliers),
                                                                                                                    framework::dataset::make("DataType",
-                                                                                                                           DataType::F16)))
+                                                                                                                           DataType::F16)),
+                                                                                                                   framework::dataset::make("DataLayout", DataLayout::NCHW)))
 {
     validate(GCAccessor(_target), _reference, tolerance_fp16, tolerance_num);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, GCDepthwiseConvolutionLayerFixture3x3<half>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(),
+FIXTURE_DATA_TEST_CASE(RunLarge, GCDepthwiseConvolutionLayerFixture3x3<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(),
+                                                                                                                       depth_multipliers),
                                                                                                                        framework::dataset::make("DataType",
-                                                                                                                               DataType::F16)))
+                                                                                                                               DataType::F16)),
+                                                                                                                       framework::dataset::make("DataLayout", DataLayout::NCHW)))
 {
     validate(GCAccessor(_target), _reference, tolerance_fp16, tolerance_num);
 }
diff --git a/tests/validation/GLES_COMPUTE/DirectConvolutionLayer.cpp b/tests/validation/GLES_COMPUTE/DirectConvolutionLayer.cpp
index eb3d307..2ff6678 100644
--- a/tests/validation/GLES_COMPUTE/DirectConvolutionLayer.cpp
+++ b/tests/validation/GLES_COMPUTE/DirectConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -68,6 +68,13 @@
                                                                  combine(framework::dataset::make("PadY", 0, 2),
                                                                          framework::dataset::make("KernelSize", { 3, 5 })))),
                                                   framework::dataset::make("NumKernels", { 1, 4, 8, 16 })))));
+/** Activation function Dataset*/
+const auto ActivationFunctionsDataset = framework::dataset::make("ActivationInfo",
+{
+    ActivationLayerInfo(),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
+});
 } // namespace
 
 TEST_SUITE(GC)
@@ -78,7 +85,9 @@
 
 TEST_SUITE(Float)
 TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(Run, GCDirectConvolutionLayerFixture<half_float::half>, framework::DatasetMode::ALL, combine(data, framework::dataset::make("DataType", DataType::F16)))
+FIXTURE_DATA_TEST_CASE(Run, GCDirectConvolutionLayerFixture<half_float::half>, framework::DatasetMode::ALL, combine(combine(combine(data, framework::dataset::make("DataType", DataType::F16)),
+                                                                                                                    ActivationFunctionsDataset),
+                                                                                                                    framework::dataset::make("DataLayout", DataLayout::NCHW)))
 {
     // Validate output
     validate(GCAccessor(_target), _reference, tolerance_fp16, tolerance_num);
@@ -86,7 +95,9 @@
 TEST_SUITE_END()
 
 TEST_SUITE(FP32)
-FIXTURE_DATA_TEST_CASE(Run, GCDirectConvolutionLayerFixture<float>, framework::DatasetMode::ALL, combine(data, framework::dataset::make("DataType", DataType::F32)))
+FIXTURE_DATA_TEST_CASE(Run, GCDirectConvolutionLayerFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(data, framework::dataset::make("DataType", DataType::F32)),
+                                                                                                                 ActivationFunctionsDataset),
+                                                                                                         framework::dataset::make("DataLayout", DataLayout::NCHW)))
 {
     // Validate output
     validate(GCAccessor(_target), _reference, tolerance_fp32);
diff --git a/tests/validation/GLES_COMPUTE/GlobalPoolingLayer.cpp b/tests/validation/GLES_COMPUTE/GlobalPoolingLayer.cpp
index 88372ff..162f189 100644
--- a/tests/validation/GLES_COMPUTE/GlobalPoolingLayer.cpp
+++ b/tests/validation/GLES_COMPUTE/GlobalPoolingLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -59,7 +59,9 @@
 
 TEST_SUITE(Float)
 TEST_SUITE(FP32)
-FIXTURE_DATA_TEST_CASE(RunGlobalPooling, GCGlobalPoolingLayerFixture<float>, framework::DatasetMode::ALL, combine(GlobalPoolingLayerDataset, framework::dataset::make("DataType", DataType::F32)))
+FIXTURE_DATA_TEST_CASE(RunGlobalPooling, GCGlobalPoolingLayerFixture<float>, framework::DatasetMode::ALL, combine(combine(GlobalPoolingLayerDataset, framework::dataset::make("DataType",
+                                                                                                                  DataType::F32)),
+                                                                                                                  framework::dataset::make("DataLayout", DataLayout::NCHW)))
 {
     // Validate output
     validate(GCAccessor(_target), _reference, tolerance_f32);
@@ -67,8 +69,9 @@
 TEST_SUITE_END()
 
 TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunGlobalPooling, GCGlobalPoolingLayerFixture<half>, framework::DatasetMode::ALL, combine(GlobalPoolingLayerDataset, framework::dataset::make("DataType",
-                                                                                                                 DataType::F16)))
+FIXTURE_DATA_TEST_CASE(RunGlobalPooling, GCGlobalPoolingLayerFixture<half>, framework::DatasetMode::ALL, combine(combine(GlobalPoolingLayerDataset, framework::dataset::make("DataType",
+                                                                                                                 DataType::F16)),
+                                                                                                                 framework::dataset::make("DataLayout", DataLayout::NCHW)))
 {
     // Validate output
     validate(GCAccessor(_target), _reference, tolerance_f16);
diff --git a/tests/validation/GLES_COMPUTE/PoolingLayer.cpp b/tests/validation/GLES_COMPUTE/PoolingLayer.cpp
index 1496cee..ac1bd72 100644
--- a/tests/validation/GLES_COMPUTE/PoolingLayer.cpp
+++ b/tests/validation/GLES_COMPUTE/PoolingLayer.cpp
@@ -87,14 +87,17 @@
 
 TEST_SUITE(Float)
 TEST_SUITE(FP32)
-FIXTURE_DATA_TEST_CASE(RunSmall, GCPoolingLayerFixture<float>, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), combine(PoolingLayerDatasetFP, framework::dataset::make("DataType",
-                                                                                                    DataType::F32))))
+FIXTURE_DATA_TEST_CASE(RunSmall, GCPoolingLayerFixture<float>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), combine(PoolingLayerDatasetFP, framework::dataset::make("DataType",
+                                                                                                            DataType::F32))),
+                                                                                                    framework::dataset::make("DataLayout", DataLayout::NCHW)))
 {
     // Validate output
     validate(GCAccessor(_target), _reference, tolerance_f32);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, GCPoolingLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapes(), combine(PoolingLayerDatasetFP, framework::dataset::make("DataType",
-                                                                                                        DataType::F32))))
+FIXTURE_DATA_TEST_CASE(RunLarge, GCPoolingLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(), combine(PoolingLayerDatasetFP,
+                                                                                                                framework::dataset::make("DataType",
+                                                                                                                        DataType::F32))),
+                                                                                                        framework::dataset::make("DataLayout", DataLayout::NCHW)))
 {
     // Validate output
     validate(GCAccessor(_target), _reference, tolerance_f32);
@@ -102,14 +105,16 @@
 TEST_SUITE_END()
 
 TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall, GCPoolingLayerFixture<half>, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), combine(PoolingLayerDatasetFP,
-                                                                                                   framework::dataset::make("DataType", DataType::F16))))
+FIXTURE_DATA_TEST_CASE(RunSmall, GCPoolingLayerFixture<half>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), combine(PoolingLayerDatasetFP,
+                                                                                                           framework::dataset::make("DataType", DataType::F16))),
+                                                                                                   framework::dataset::make("DataLayout", DataLayout::NCHW)))
 {
     // Validate output
     validate(GCAccessor(_target), _reference, tolerance_f16);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, GCPoolingLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapes(), combine(PoolingLayerDatasetFP,
-                                                                                                       framework::dataset::make("DataType", DataType::F16))))
+FIXTURE_DATA_TEST_CASE(RunLarge, GCPoolingLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(), combine(PoolingLayerDatasetFP,
+                                                                                                               framework::dataset::make("DataType", DataType::F16))),
+                                                                                                       framework::dataset::make("DataLayout", DataLayout::NCHW)))
 {
     // Validate output
     validate(GCAccessor(_target), _reference, tolerance_f16);
diff --git a/tests/validation/GLES_COMPUTE/Scale.cpp b/tests/validation/GLES_COMPUTE/Scale.cpp
index 92c4a89..4bfa08f 100644
--- a/tests/validation/GLES_COMPUTE/Scale.cpp
+++ b/tests/validation/GLES_COMPUTE/Scale.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -90,7 +90,7 @@
     const BorderSize border_size(border_mode == BorderMode::UNDEFINED ? 0 : 1);
 
     // Validate valid region
-    const ValidRegion dst_valid_region = calculate_valid_region_scale(*(src.info()), shape_scaled, policy, border_size, (border_mode == BorderMode::UNDEFINED));
+    const ValidRegion dst_valid_region = calculate_valid_region_scale(*(src.info()), shape_scaled, policy, sampling_policy, (border_mode == BorderMode::UNDEFINED));
     validate(dst.info()->valid_region(), dst_valid_region);
 
     // Validate padding
@@ -108,27 +108,30 @@
 
 TEST_SUITE(Float)
 TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall, GCScaleFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F16)),
+FIXTURE_DATA_TEST_CASE(RunSmall, GCScaleFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType",
+                                                                                                                    DataType::F16)),
+                                                                                                                    framework::dataset::make("DataLayout", { DataLayout::NCHW })),
                                                                                                             framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR })),
                                                                                                     datasets::BorderModes()),
                                                                                             datasets::SamplingPolicies()))
 {
     //Create valid region
     TensorInfo        src_info(_shape, 1, _data_type);
-    const ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, BorderSize(1), (_border_mode == BorderMode::UNDEFINED));
+    const ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
 
     // Validate output
     validate(GCAccessor(_target), _reference, valid_region, tolerance_f16);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, GCScaleFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType",
+FIXTURE_DATA_TEST_CASE(RunLarge, GCScaleFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType",
                                                                                                                         DataType::F16)),
+                                                                                                                        framework::dataset::make("DataLayout", { DataLayout::NCHW })),
                                                                                                                 framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR })),
                                                                                                         datasets::BorderModes()),
                                                                                                 datasets::SamplingPolicies()))
 {
     //Create valid region
     TensorInfo        src_info(_shape, 1, _data_type);
-    const ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, BorderSize(1), (_border_mode == BorderMode::UNDEFINED));
+    const ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
 
     // Validate output
     validate(GCAccessor(_target), _reference, valid_region, tolerance_f16);
diff --git a/tests/validation/GLES_COMPUTE/UNIT/MemoryManager.cpp b/tests/validation/GLES_COMPUTE/UNIT/MemoryManager.cpp
new file mode 100644
index 0000000..8f59a05
--- /dev/null
+++ b/tests/validation/GLES_COMPUTE/UNIT/MemoryManager.cpp
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/GLES_COMPUTE/GCBufferAllocator.h"
+#include "arm_compute/runtime/GLES_COMPUTE/GCMemoryGroup.h"
+#include "arm_compute/runtime/GLES_COMPUTE/GCTensorAllocator.h"
+#include "arm_compute/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.h"
+#include "arm_compute/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.h"
+#include "support/ToolchainSupport.h"
+#include "tests/AssetsLibrary.h"
+#include "tests/GLES_COMPUTE/GCAccessor.h"
+#include "tests/Globals.h"
+#include "tests/Utils.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Macros.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/fixtures/UNIT/MemoryManagerFixture.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace
+{
+RelativeTolerance<float> tolerance_f32(0.05f);
+} // namespace
+
+TEST_SUITE(GC)
+TEST_SUITE(UNIT)
+TEST_SUITE(MemoryManager)
+
+// Setting BlobMemoryManagerSimpleWithinFunctionLevel test
+using GCBlobMemoryManagerSimpleWithinFunctionLevelFixture = BlobMemoryManagerSimpleTestCaseFixture<GCTensor,
+      GCAccessor,
+      GCBufferAllocator,
+      GCFullyConnectedLayer>;
+FIXTURE_TEST_CASE(BlobMemoryManagerSimpleWithinFunctionLevel,
+                  GCBlobMemoryManagerSimpleWithinFunctionLevelFixture,
+                  framework::DatasetMode::ALL)
+{
+    // Validate output
+    validate(GCAccessor(_target), _reference, tolerance_f32);
+}
+
+// Setting BlobMemoryManagerReconfigure test
+using GCBlobMemoryManagerReconfigureFixture = BlobMemoryManagerReconfigureTestCaseFixture<GCTensor,
+      GCAccessor,
+      GCBufferAllocator,
+      GCFullyConnectedLayer>;
+FIXTURE_TEST_CASE(BlobMemoryManagerReconfigure,
+                  GCBlobMemoryManagerReconfigureFixture,
+                  framework::DatasetMode::ALL)
+{
+    // Validate output
+    validate(GCAccessor(_target), _reference, tolerance_f32);
+}
+
+// Setting BlobMemoryManagerReconfigure2 test
+using GCBlobMemoryManagerReconfigure2Fixture = BlobMemoryManagerReconfigure2TestCaseFixture<GCTensor,
+      GCAccessor,
+      GCBufferAllocator,
+      GCFullyConnectedLayer,
+      GCSoftmaxLayer>;
+FIXTURE_TEST_CASE(BlobMemoryManagerReconfigure2,
+                  GCBlobMemoryManagerReconfigure2Fixture,
+                  framework::DatasetMode::ALL)
+{
+    // Validate output
+    validate(GCAccessor(_target), _reference, tolerance_f32);
+}
+
+TEST_SUITE_END()
+TEST_SUITE_END()
+TEST_SUITE_END()
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/Helpers.cpp b/tests/validation/Helpers.cpp
index 313b059..25dc6c5 100644
--- a/tests/validation/Helpers.cpp
+++ b/tests/validation/Helpers.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,6 +23,9 @@
  */
 #include "tests/validation/Helpers.h"
 
+#include <algorithm>
+#include <cmath>
+
 namespace arm_compute
 {
 namespace test
@@ -95,12 +98,27 @@
     return out_shape;
 }
 
+TensorShape calculate_width_concatenate_shape(const std::vector<TensorShape> &input_shapes)
+{
+    ARM_COMPUTE_ERROR_ON(input_shapes.empty());
+
+    TensorShape out_shape = input_shapes[0];
+
+    int width = std::accumulate(input_shapes.begin(), input_shapes.end(), 0, [](int sum, const TensorShape & shape)
+    {
+        return sum + shape.x();
+    });
+    out_shape.set(0, width);
+
+    return out_shape;
+}
+
 HarrisCornersParameters harris_corners_parameters()
 {
     HarrisCornersParameters params;
 
     std::mt19937                           gen(library->seed());
-    std::uniform_real_distribution<float>  threshold_dist(0.f, 0.01f);
+    std::uniform_real_distribution<float>  threshold_dist(0.f, 0.001f);
     std::uniform_real_distribution<float>  sensitivity(0.04f, 0.15f);
     std::uniform_real_distribution<float>  euclidean_distance(0.f, 30.f);
     std::uniform_int_distribution<uint8_t> int_dist(0, 255);
@@ -116,7 +134,8 @@
 SimpleTensor<float> convert_from_asymmetric(const SimpleTensor<uint8_t> &src)
 {
     const QuantizationInfo &quantization_info = src.quantization_info();
-    SimpleTensor<float>     dst{ src.shape(), DataType::F32, 1, 0 };
+    SimpleTensor<float>     dst{ src.shape(), DataType::F32, 1, 0, QuantizationInfo(), src.data_layout() };
+
     for(int i = 0; i < src.num_elements(); ++i)
     {
         dst[i] = quantization_info.dequantize(src[i]);
@@ -133,6 +152,111 @@
     }
     return dst;
 }
+
+void matrix_multiply(const SimpleTensor<float> &a, const SimpleTensor<float> &b, SimpleTensor<float> &out)
+{
+    ARM_COMPUTE_ERROR_ON(a.shape()[0] != b.shape()[1]);
+    ARM_COMPUTE_ERROR_ON(a.shape()[1] != out.shape()[1]);
+    ARM_COMPUTE_ERROR_ON(b.shape()[0] != out.shape()[0]);
+
+    const int M = a.shape()[1]; // Rows
+    const int N = b.shape()[0]; // Cols
+    const int K = b.shape()[1];
+
+    for(int y = 0; y < M; ++y)
+    {
+        for(int x = 0; x < N; ++x)
+        {
+            float acc = 0.0f;
+            for(int k = 0; k < K; ++k)
+            {
+                acc += a[y * K + k] * b[x + k * N];
+            }
+
+            out[x + y * N] = acc;
+        }
+    }
+}
+
+void transpose_matrix(const SimpleTensor<float> &in, SimpleTensor<float> &out)
+{
+    ARM_COMPUTE_ERROR_ON((in.shape()[0] != out.shape()[1]) || (in.shape()[1] != out.shape()[0]));
+
+    const int width  = in.shape()[0];
+    const int height = in.shape()[1];
+
+    for(int y = 0; y < height; ++y)
+    {
+        for(int x = 0; x < width; ++x)
+        {
+            const float val = in[x + y * width];
+
+            out[x * height + y] = val;
+        }
+    }
+}
+
+template <typename T>
+void get_tile(const SimpleTensor<T> &in, SimpleTensor<T> &tile, const Coordinates &coord)
+{
+    ARM_COMPUTE_ERROR_ON(tile.shape().num_dimensions() != 2);
+
+    const int w_tile = tile.shape()[0];
+    const int h_tile = tile.shape()[1];
+
+    // Fill the tile with zeros
+    std::fill(tile.data() + 0, (tile.data() + (w_tile * h_tile)), static_cast<T>(0));
+
+    // Check if with the dimensions greater than 2 we could have out-of-bound reads
+    for(size_t d = 2; d < Coordinates::num_max_dimensions; ++d)
+    {
+        if(coord[d] < 0 || coord[d] >= static_cast<int>(in.shape()[d]))
+        {
+            ARM_COMPUTE_ERROR("coord[d] < 0 || coord[d] >= in.shape()[d] with d >= 2");
+        }
+    }
+
+    // Since we could have out-of-bound reads along the X and Y dimensions,
+    // we start calculating the input address with x = 0 and y = 0
+    Coordinates start_coord = coord;
+    start_coord[0]          = 0;
+    start_coord[1]          = 0;
+
+    // Get input and roi pointers
+    auto in_ptr  = static_cast<const T *>(in(start_coord));
+    auto roi_ptr = static_cast<T *>(tile.data());
+
+    const int x_in_start = std::max(0, coord[0]);
+    const int y_in_start = std::max(0, coord[1]);
+    const int x_in_end   = std::min(static_cast<int>(in.shape()[0]), coord[0] + w_tile);
+    const int y_in_end   = std::min(static_cast<int>(in.shape()[1]), coord[1] + h_tile);
+
+    // Number of elements to copy per row
+    const int n = x_in_end - x_in_start;
+
+    // Starting coordinates for the ROI
+    const int x_tile_start = coord[0] > 0 ? 0 : std::abs(coord[0]);
+    const int y_tile_start = coord[1] > 0 ? 0 : std::abs(coord[1]);
+
+    // Update input pointer
+    in_ptr += x_in_start;
+    in_ptr += (y_in_start * in.shape()[0]);
+
+    // Update ROI pointer
+    roi_ptr += x_tile_start;
+    roi_ptr += (y_tile_start * tile.shape()[0]);
+
+    for(int y = y_in_start; y < y_in_end; ++y)
+    {
+        // Copy per row
+        std::copy(in_ptr, in_ptr + n, roi_ptr);
+
+        in_ptr += in.shape()[0];
+        roi_ptr += tile.shape()[0];
+    }
+}
+
+template void get_tile(const SimpleTensor<float> &in, SimpleTensor<float> &roi, const Coordinates &coord);
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/Helpers.h b/tests/validation/Helpers.h
old mode 100755
new mode 100644
index ba45968..d07803f
--- a/tests/validation/Helpers.h
+++ b/tests/validation/Helpers.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -148,13 +148,21 @@
  */
 TensorShape calculate_depth_concatenate_shape(const std::vector<TensorShape> &input_shapes);
 
+/** Calculate output tensor shape given a vector of input tensor shapes to concatenate
+ *
+ * @param[in] input_shapes Shapes of the tensors to concatenate across width.
+ *
+ * @return The shape of output concatenated tensor.
+ */
+TensorShape calculate_width_concatenate_shape(const std::vector<TensorShape> &input_shapes);
+
 /** Parameters of Harris Corners algorithm. */
 struct HarrisCornersParameters
 {
-    float   threshold{ 0.f };
-    float   sensitivity{ 0.f };
-    float   min_dist{ 0.f };
-    uint8_t constant_border_value{ 0 };
+    float   threshold{ 0.f };           /**< Threshold */
+    float   sensitivity{ 0.f };         /**< Sensitivity */
+    float   min_dist{ 0.f };            /**< Minimum distance */
+    uint8_t constant_border_value{ 0 }; /**< Border value */
 };
 
 /** Generate parameters for Harris Corners algorithm. */
@@ -232,6 +240,34 @@
  * @return Quantized tensor.
  */
 SimpleTensor<uint8_t> convert_to_asymmetric(const SimpleTensor<float> &src, const QuantizationInfo &quantization_info);
+
+/** Matrix multiply between 2 float simple tensors
+ *
+ * @param[in]  a   Input tensor A
+ * @param[in]  b   Input tensor B
+ * @param[out] out Output tensor
+ *
+ */
+void matrix_multiply(const SimpleTensor<float> &a, const SimpleTensor<float> &b, SimpleTensor<float> &out);
+
+/** Transpose matrix
+ *
+ * @param[in]  in  Input tensor
+ * @param[out] out Output tensor
+ *
+ */
+void transpose_matrix(const SimpleTensor<float> &in, SimpleTensor<float> &out);
+
+/** Get a 2D tile from a tensor
+ *
+ * @note In case of out-of-bound reads, the tile will be filled with zeros
+ *
+ * @param[in]  in    Input tensor
+ * @param[out] tile  Tile
+ * @param[in]  coord Coordinates
+ */
+template <typename T>
+void get_tile(const SimpleTensor<T> &in, SimpleTensor<T> &tile, const Coordinates &coord);
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/BatchNormalizationLayer.cpp b/tests/validation/NEON/BatchNormalizationLayer.cpp
index 054ed27..53fd016 100644
--- a/tests/validation/NEON/BatchNormalizationLayer.cpp
+++ b/tests/validation/NEON/BatchNormalizationLayer.cpp
@@ -32,6 +32,7 @@
 #include "tests/framework/Asserts.h"
 #include "tests/framework/Macros.h"
 #include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/Helpers.h"
 #include "tests/validation/Validation.h"
 #include "tests/validation/fixtures/BatchNormalizationLayerFixture.h"
 
@@ -63,15 +64,24 @@
 template <typename T>
 using NEBatchNormalizationLayerFixture = BatchNormalizationLayerValidationFixture<Tensor, Accessor, NEBatchNormalizationLayer, T>;
 
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::RandomBatchNormalizationLayerDataset(), framework::dataset::make("DataType", { DataType::QS8, DataType::QS16, DataType::F32 })),
-               shape0, shape1, epsilon, dt)
+DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(combine(datasets::RandomBatchNormalizationLayerDataset(),
+                                                                                   combine(framework::dataset::make("UseBeta", { false, true }), framework::dataset::make("UseGamma", { false, true }))),
+                                                                           framework::dataset::make("DataType", { DataType::QS8, DataType::QS16, DataType::F32 })),
+                                                                   framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+               shape0, shape1, epsilon, use_beta, use_gamma, dt, data_layout)
 {
     // Set fixed point position data type allowed
     const int fixed_point_position = (arm_compute::is_data_type_fixed_point(dt)) ? 3 : 0;
 
+    TensorShape src_dst_shapes = shape0;
+    if(data_layout == DataLayout::NHWC)
+    {
+        permute(src_dst_shapes, PermutationVector(2U, 0U, 1U));
+    }
+
     // Create tensors
-    Tensor src   = create_tensor<Tensor>(shape0, dt, 1, fixed_point_position);
-    Tensor dst   = create_tensor<Tensor>(shape0, dt, 1, fixed_point_position);
+    Tensor src   = create_tensor<Tensor>(src_dst_shapes, dt, 1, fixed_point_position, QuantizationInfo(), data_layout);
+    Tensor dst   = create_tensor<Tensor>(src_dst_shapes, dt, 1, fixed_point_position, QuantizationInfo(), data_layout);
     Tensor mean  = create_tensor<Tensor>(shape1, dt, 1, fixed_point_position);
     Tensor var   = create_tensor<Tensor>(shape1, dt, 1, fixed_point_position);
     Tensor beta  = create_tensor<Tensor>(shape1, dt, 1, fixed_point_position);
@@ -79,10 +89,12 @@
 
     // Create and Configure function
     NEBatchNormalizationLayer norm;
-    norm.configure(&src, &dst, &mean, &var, &beta, &gamma, epsilon);
+    Tensor                   *beta_ptr  = use_beta ? &beta : nullptr;
+    Tensor                   *gamma_ptr = use_gamma ? &gamma : nullptr;
+    norm.configure(&src, &dst, &mean, &var, beta_ptr, gamma_ptr, epsilon);
 
     // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape0);
+    const ValidRegion valid_region = shape_to_valid_region(src_dst_shapes);
     validate(dst.info()->valid_region(), valid_region);
 }
 
@@ -150,9 +162,13 @@
 // *INDENT-ON*
 
 TEST_SUITE(Float)
-FIXTURE_DATA_TEST_CASE(Random, NEBatchNormalizationLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::RandomBatchNormalizationLayerDataset(),
+TEST_SUITE(FP32)
+FIXTURE_DATA_TEST_CASE(Random, NEBatchNormalizationLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::RandomBatchNormalizationLayerDataset(),
+                                                                                                                   combine(framework::dataset::make("UseBeta", { false, true }),
+                                                                                                                           framework::dataset::make("UseGamma", { false, true }))),
                                                                                                                    act_infos),
-                                                                                                                   framework::dataset::make("DataType", DataType::F32)))
+                                                                                                                   framework::dataset::make("DataType", DataType::F32)),
+                                                                                                                   framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f32, 0);
@@ -160,26 +176,34 @@
 TEST_SUITE_END()
 
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-TEST_SUITE(Float16)
-FIXTURE_DATA_TEST_CASE(Random, NEBatchNormalizationLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::RandomBatchNormalizationLayerDataset(),
+TEST_SUITE(FP16)
+FIXTURE_DATA_TEST_CASE(Random, NEBatchNormalizationLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::RandomBatchNormalizationLayerDataset(),
+                                                                                                                  combine(framework::dataset::make("UseBeta", { false, true }),
+                                                                                                                          framework::dataset::make("UseGamma", { false, true }))),
                                                                                                                   framework::dataset::make("ActivationInfo", ActivationLayerInfo())),
-                                                                                                                  framework::dataset::make("DataType", DataType::F16)))
+                                                                                                                  framework::dataset::make("DataType", DataType::F16)),
+                                                                                                                  framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f16, 0);
 }
 TEST_SUITE_END()
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+TEST_SUITE_END()
 
 TEST_SUITE(Quantized)
 template <typename T>
 using NEBatchNormalizationLayerFixedPointFixture = BatchNormalizationLayerValidationFixedPointFixture<Tensor, Accessor, NEBatchNormalizationLayer, T>;
 
 TEST_SUITE(QS8)
-FIXTURE_DATA_TEST_CASE(Random, NEBatchNormalizationLayerFixedPointFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::RandomBatchNormalizationLayerDataset(),
-                       framework::dataset::make("ActivationInfo", ActivationLayerInfo())),
-                       framework::dataset::make("DataType", DataType::QS8)),
-                       framework::dataset::make("FractionalBits", 1, 6)))
+FIXTURE_DATA_TEST_CASE(Random, NEBatchNormalizationLayerFixedPointFixture<int8_t>, framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(combine(combine(combine(datasets::RandomBatchNormalizationLayerDataset(),
+                                                                       framework::dataset::make("UseBeta", false)),
+                                                               framework::dataset::make("UseGamma", false)),
+                                                       framework::dataset::make("ActivationInfo", ActivationLayerInfo())),
+                                               framework::dataset::make("DataType", DataType::QS8)),
+                                       framework::dataset::make("DataLayout", DataLayout::NCHW)),
+                               framework::dataset::make("FractionalBits", 1, 6)))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qs8, 0);
@@ -187,10 +211,14 @@
 TEST_SUITE_END()
 
 TEST_SUITE(QS16)
-FIXTURE_DATA_TEST_CASE(Random, NEBatchNormalizationLayerFixedPointFixture<int16_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::RandomBatchNormalizationLayerDataset(),
-                       framework::dataset::make("ActivationInfo", ActivationLayerInfo())),
-                       framework::dataset::make("DataType", DataType::QS16)),
-                       framework::dataset::make("FractionalBits", 1, 14)))
+FIXTURE_DATA_TEST_CASE(Random, NEBatchNormalizationLayerFixedPointFixture<int16_t>, framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(combine(combine(combine(datasets::RandomBatchNormalizationLayerDataset(),
+                                                                       framework::dataset::make("UseBeta", false)),
+                                                               framework::dataset::make("UseGamma", false)),
+                                                       framework::dataset::make("ActivationInfo", ActivationLayerInfo())),
+                                               framework::dataset::make("DataType", DataType::QS16)),
+                                       framework::dataset::make("DataLayout", DataLayout::NCHW)),
+                               framework::dataset::make("FractionalBits", 1, 14)))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qs16, 0);
diff --git a/tests/validation/NEON/ChannelCombine.cpp b/tests/validation/NEON/ChannelCombine.cpp
new file mode 100644
index 0000000..7c05c88
--- /dev/null
+++ b/tests/validation/NEON/ChannelCombine.cpp
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/MultiImage.h"
+#include "arm_compute/runtime/NEON/functions/NEChannelCombine.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+#include "tests/NEON/Accessor.h"
+#include "tests/PaddingCalculator.h"
+#include "tests/datasets/ConvertPolicyDataset.h"
+#include "tests/datasets/ShapeDatasets.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/fixtures/ChannelCombineFixture.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace
+{
+inline void validate_configuration(const TensorShape &shape, Format format)
+{
+    const int num_planes = num_planes_from_format(format);
+
+    // Create tensors
+    MultiImage          dst     = create_multi_image<MultiImage>(shape, format);
+    std::vector<Tensor> ref_src = create_tensor_planes<Tensor>(shape, format);
+
+    // Create and configure function
+    NEChannelCombine channel_combine;
+
+    if(num_planes == 1)
+    {
+        const Tensor *tensor_extra = Format::RGBA8888 == format ? &ref_src[3] : nullptr;
+
+        channel_combine.configure(&ref_src[0], &ref_src[1], &ref_src[2], tensor_extra, dst.plane(0));
+    }
+    else
+    {
+        channel_combine.configure(&ref_src[0], &ref_src[1], &ref_src[2], &dst);
+    }
+}
+} // namespace
+
+TEST_SUITE(NEON)
+TEST_SUITE(ChannelCombine)
+
+TEST_SUITE(Configuration)
+DATA_TEST_CASE(RGBA, framework::DatasetMode::ALL, combine(concat(datasets::Small2DShapes(), datasets::Large2DShapes()), framework::dataset::make("FormatType", { Format::RGB888, Format::RGBA8888 })),
+               shape, format)
+{
+    validate_configuration(shape, format);
+}
+DATA_TEST_CASE(YUV, framework::DatasetMode::ALL, combine(concat(datasets::Small2DShapes(), datasets::Large2DShapes()), framework::dataset::make("FormatType", { Format::YUYV422, Format::UYVY422 })),
+               shape, format)
+{
+    validate_configuration(shape, format);
+}
+
+DATA_TEST_CASE(YUVPlanar, framework::DatasetMode::ALL, combine(concat(datasets::Small2DShapes(), datasets::Large2DShapes()), framework::dataset::make("FormatType", { Format::IYUV, Format::YUV444, Format::NV12, Format::NV21 })),
+               shape, format)
+{
+    validate_configuration(shape, format);
+}
+TEST_SUITE_END()
+
+template <typename T>
+using NEChannelCombineFixture = ChannelCombineValidationFixture<MultiImage, Tensor, Accessor, NEChannelCombine, T>;
+
+TEST_SUITE(RGBA)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEChannelCombineFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(datasets::Small2DShapes(), framework::dataset::make("FormatType", { Format::RGB888, Format::RGBA8888 })))
+{
+    // Validate output
+    for(unsigned int plane_idx = 0; plane_idx < _num_planes; ++plane_idx)
+    {
+        validate(Accessor(*_target.plane(plane_idx)), _reference[plane_idx]);
+    }
+}
+FIXTURE_DATA_TEST_CASE(RunLarge, NEChannelCombineFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Large2DShapes(), framework::dataset::make("FormatType", { Format::RGB888, Format::RGBA8888 })))
+{
+    // Validate output
+    for(unsigned int plane_idx = 0; plane_idx < _num_planes; ++plane_idx)
+    {
+        validate(Accessor(*_target.plane(plane_idx)), _reference[plane_idx]);
+    }
+}
+TEST_SUITE_END()
+
+TEST_SUITE(YUV)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEChannelCombineFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(datasets::Small2DShapes(), framework::dataset::make("FormatType", { Format::YUYV422, Format::UYVY422 })))
+{
+    // Validate output
+    for(unsigned int plane_idx = 0; plane_idx < _num_planes; ++plane_idx)
+    {
+        validate(Accessor(*_target.plane(plane_idx)), _reference[plane_idx]);
+    }
+}
+FIXTURE_DATA_TEST_CASE(RunLarge, NEChannelCombineFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Large2DShapes(), framework::dataset::make("FormatType", { Format::YUYV422, Format::UYVY422 })))
+{
+    // Validate output
+    for(unsigned int plane_idx = 0; plane_idx < _num_planes; ++plane_idx)
+    {
+        validate(Accessor(*_target.plane(plane_idx)), _reference[plane_idx]);
+    }
+}
+TEST_SUITE_END()
+
+TEST_SUITE(YUVPlanar)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEChannelCombineFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(datasets::Small2DShapes(), framework::dataset::make("FormatType", { Format::NV12, Format::NV21, Format::IYUV, Format::YUV444 })))
+{
+    // Validate output
+    for(unsigned int plane_idx = 0; plane_idx < _num_planes; ++plane_idx)
+    {
+        validate(Accessor(*_target.plane(plane_idx)), _reference[plane_idx]);
+    }
+}
+FIXTURE_DATA_TEST_CASE(RunLarge, NEChannelCombineFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Large2DShapes(), framework::dataset::make("FormatType", { Format::NV12, Format::NV21, Format::IYUV, Format::YUV444 })))
+{
+    // Validate output
+    for(unsigned int plane_idx = 0; plane_idx < _num_planes; ++plane_idx)
+    {
+        validate(Accessor(*_target.plane(plane_idx)), _reference[plane_idx]);
+    }
+}
+TEST_SUITE_END()
+
+TEST_SUITE_END()
+TEST_SUITE_END()
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/NEON/ConvertFullyConnectedWeights.cpp b/tests/validation/NEON/ConvertFullyConnectedWeights.cpp
new file mode 100644
index 0000000..dbb81d6
--- /dev/null
+++ b/tests/validation/NEON/ConvertFullyConnectedWeights.cpp
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+#include "tests/NEON/Accessor.h"
+#include "tests/datasets/ShapeDatasets.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/fixtures/ConvertFullyConnectedWeightsFixture.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace
+{
+auto params = combine(framework::dataset::make("WeightsWidth", { 16, 32, 64 }), framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }));
+} // namespace
+
+TEST_SUITE(NEON)
+TEST_SUITE(ConvertFullyConnectedWeights)
+
+template <typename T>
+using NEConvertFullyConnectedWeightsFixture = ConvertFullyConnectedWeightsValidationFixture<Tensor, Accessor, NEConvertFullyConnectedWeights, T>;
+
+TEST_SUITE(FP32)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEConvertFullyConnectedWeightsFixture<float>, framework::DatasetMode::ALL, combine(datasets::Small3DShapes(), combine(params, framework::dataset::make("DataType",
+                                                                                                                    DataType::F32))))
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge, NEConvertFullyConnectedWeightsFixture<float>, framework::DatasetMode::NIGHTLY, combine(datasets::Large3DShapes(), combine(params, framework::dataset::make("DataType",
+                                                                                                                        DataType::F32))))
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+TEST_SUITE_END()
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+TEST_SUITE(FP16)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEConvertFullyConnectedWeightsFixture<half>, framework::DatasetMode::ALL, combine(datasets::Small3DShapes(), combine(params, framework::dataset::make("DataType",
+                                                                                                                   DataType::F16))))
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge, NEConvertFullyConnectedWeightsFixture<half>, framework::DatasetMode::NIGHTLY, combine(datasets::Large3DShapes(), combine(params, framework::dataset::make("DataType",
+                                                                                                                       DataType::F16))))
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+TEST_SUITE_END()
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+
+TEST_SUITE(QASYMM8)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEConvertFullyConnectedWeightsFixture<uint8_t>, framework::DatasetMode::ALL, combine(datasets::Small3DShapes(), combine(params, framework::dataset::make("DataType",
+                                                                                                                      DataType::QASYMM8))))
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge, NEConvertFullyConnectedWeightsFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(datasets::Large3DShapes(), combine(params,
+                       framework::dataset::make("DataType",
+                                                DataType::QASYMM8))))
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+TEST_SUITE_END()
+
+TEST_SUITE_END()
+TEST_SUITE_END()
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/NEON/ConvolutionLayer.cpp b/tests/validation/NEON/ConvolutionLayer.cpp
index eabc6ad..776d1ae 100644
--- a/tests/validation/NEON/ConvolutionLayer.cpp
+++ b/tests/validation/NEON/ConvolutionLayer.cpp
@@ -24,7 +24,7 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"
 #include "arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEWinogradLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h"
 #include "arm_compute/runtime/Tensor.h"
 #include "arm_compute/runtime/TensorAllocator.h"
 #include "tests/NEON/Accessor.h"
@@ -37,7 +37,7 @@
 #include "tests/framework/datasets/Datasets.h"
 #include "tests/validation/Validation.h"
 #include "tests/validation/fixtures/ConvolutionLayerFixture.h"
-#include "tests/validation/fixtures/WinogradLayerFixture.h"
+#include "tests/validation/fixtures/WinogradConvolutionLayerFixture.h"
 
 namespace arm_compute
 {
@@ -47,7 +47,7 @@
 {
 namespace
 {
-const AbsoluteTolerance<float> tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for DataType::F32 */
+const AbsoluteTolerance<float> tolerance_f32(0.002f); /**< Tolerance value for comparing reference's output against implementation's output for DataType::F32 */
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 const AbsoluteTolerance<float> tolerance_f16(0.01f);       /**< Tolerance value for comparing reference's output against implementation's output for DataType::F16 */
 #endif                                                     /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
@@ -65,54 +65,76 @@
     DataType::QS16,
     DataType::QASYMM8,
 });
+const auto ActivationFunctionsDataset = framework::dataset::make("ActivationInfo",
+{
+    ActivationLayerInfo(),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 0.5f)
+});
 } // namespace
 
 TEST_SUITE(NEON)
 
 TEST_SUITE(ConvolutionLayer)
 DATA_TEST_CASE(ValidateConvolutionMethod, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(
-                                                                                           framework::dataset::make("InputInfo", { TensorInfo(TensorShape(8U, 8U, 2U), 1, DataType::F32, 0),
-                                                                                                                    TensorInfo(TensorShape(23U, 27U, 5U, 4U), 1, DataType::F32, 0),
+                                                                                           framework::dataset::make("InputInfo", { TensorInfo(TensorShape(18U, 18U, 32U), 1, DataType::F32, 0),
+                                                                                                                    TensorInfo(TensorShape(23U, 27U, 32U, 4U), 1, DataType::F32, 0),
                                                                                                                     TensorInfo(TensorShape(3U, 3U, 2U, 1U), 1, DataType::F32, 0),
                                                                                                                     TensorInfo(TensorShape(33U, 27U, 7U, 4U), 1, DataType::F32, 0)
                                                                                                                                  }),
-                                                                                           framework::dataset::make("WeightsInfo", { TensorInfo(TensorShape(3U, 3U, 5U, 21U), 1, DataType::F32, 0),
-                                                                                                                    TensorInfo(TensorShape(3U, 3U, 5U, 21U), 1, DataType::F32, 0),
+                                                                                           framework::dataset::make("WeightsInfo", { TensorInfo(TensorShape(3U, 3U, 32U, 21U), 1, DataType::F32, 0),
+                                                                                                                    TensorInfo(TensorShape(5U, 5U, 32U, 21U), 1, DataType::F32, 0),
                                                                                                                     TensorInfo(TensorShape(3U, 3U, 5U, 21U), 1, DataType::F32, 0),
                                                                                                                     TensorInfo(TensorShape(5U, 5U, 7U, 16U), 1, DataType::F16, 0)
                                                                                                                                    })),
-                                                                                       framework::dataset::make("BiasesInfo", { TensorInfo(TensorShape(1U), 1, DataType::F32, 0),
-                                                                                                                TensorInfo(TensorShape(21U), 1, DataType::F32, 0),
-                                                                                                                TensorInfo(TensorShape(21U), 1, DataType::F32, 0),
-                                                                                                                TensorInfo(TensorShape(16U), 1, DataType::F32, 0)
+                                                                                       framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(16U, 16U, 21U), 1, DataType::F32, 0),
+                                                                                                                TensorInfo(TensorShape(19U, 23U, 21U, 4U), 1, DataType::F32, 0),
+                                                                                                                TensorInfo(TensorShape(11U, 25U, 21U), 1, DataType::F32, 0),
+                                                                                                                TensorInfo(TensorShape(11U, 12U, 16U, 4U), 1, DataType::F32, 0)
                                                                                                                               })),
-                                                                                   framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(6U, 6U, 1U), 1, DataType::F32, 0),
-                                                                                                            TensorInfo(TensorShape(21U, 25U, 21U, 4U), 1, DataType::F32, 0),
-                                                                                                            TensorInfo(TensorShape(11U, 25U, 21U), 1, DataType::F32, 0),
-                                                                                                            TensorInfo(TensorShape(11U, 12U, 16U, 4U), 1, DataType::F32, 0)
-                                                                                                                          })),
-                                                                               framework::dataset::make("ConvInfo", { PadStrideInfo(1, 1, 0, 0),
-                                                                                                                      PadStrideInfo(1, 1, 0, 0),
-                                                                                                                      PadStrideInfo(2, 1, 0, 0),
-                                                                                                                      PadStrideInfo(3, 2, 1, 0)
+                                                                                   framework::dataset::make("ConvInfo", { PadStrideInfo(1, 1, 0, 0),
+                                                                                                            PadStrideInfo(1, 1, 0, 0),
+                                                                                                            PadStrideInfo(2, 1, 0, 0),
+                                                                                                            PadStrideInfo(3, 2, 1, 0)
+                                                                                                                        })),
+                                                                               framework::dataset::make("FastMath", { true,
+                                                                                                                      true,
+                                                                                                                      false,
+                                                                                                                      false
                                                                                                                     })),
                                                                            framework::dataset::make("Expected", { ConvolutionMethod::WINOGRAD, ConvolutionMethod::WINOGRAD, ConvolutionMethod::GEMM, ConvolutionMethod::GEMM })),
-               input_info, weights_info, biases_info, output_info, conv_info, expected)
+               input_info, weights_info, output_info, conv_info, fast_math, expected)
 {
-    ConvolutionMethod is_valid = NEConvolutionLayer::get_convolution_method(&input_info.clone()->set_is_resizable(false),
-                                                                            &weights_info.clone()->set_is_resizable(false),
-                                                                            &biases_info.clone()->set_is_resizable(false),
-                                                                            &output_info.clone()->set_is_resizable(false), conv_info);
+    ConvolutionMethod is_valid = NEConvolutionLayer::get_convolution_method(&input_info.clone()->set_is_resizable(true),
+                                                                            &weights_info.clone()->set_is_resizable(true),
+                                                                            &output_info.clone()->set_is_resizable(true), conv_info, WeightsInfo(), Size2D(1U, 1U), ActivationLayerInfo(), fast_math);
     ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS);
 }
 TEST_SUITE_END()
 
 TEST_SUITE(WinogradLayer)
 template <typename T>
-using NEWinogradLayerFixture = WinogradLayerValidationFixture<Tensor, Accessor, NEWinogradLayer, T>;
+using NEWinogradConvolutionLayerFixture = WinogradConvolutionLayerFastMathValidationFixture<Tensor, Accessor, NEWinogradConvolutionLayer, T>;
+
+template <typename T>
+using NEWinogradConvolutionLayerNoBiasFixture = WinogradConvolutionLayerFastMathValidationFixture<Tensor, Accessor, NEWinogradConvolutionLayer, T, false>;
 
 TEST_SUITE(FP32)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEWinogradLayerFixture<float>, framework::DatasetMode::PRECOMMIT, datasets::SmallWinogradLayerDataset())
+FIXTURE_DATA_TEST_CASE(RunSmall, NEWinogradConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT,
+                       combine(combine(framework::dataset::concat(datasets::SmallWinogradConvolutionLayer3x3Dataset(),
+                                                                  datasets::SmallWinogradConvolutionLayer5x5Dataset()),
+                                       framework::dataset::make("DataType", { DataType::F32 })),
+                               ActivationFunctionsDataset))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_f32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmallNoBias, NEWinogradConvolutionLayerNoBiasFixture<float>, framework::DatasetMode::PRECOMMIT,
+                       combine(combine(framework::dataset::concat(datasets::SmallWinogradConvolutionLayer3x3Dataset(),
+                                                                  datasets::SmallWinogradConvolutionLayer5x5Dataset()),
+                                       framework::dataset::make("DataType", { DataType::F32 })),
+                               ActivationFunctionsDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f32);
@@ -123,8 +145,11 @@
 
 TEST_SUITE(GEMMConvolutionLayer)
 
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(framework::dataset::concat(datasets::SmallConvolutionLayerDataset(), datasets::LargeConvolutionLayerDataset()), CNNDataTypes),
-               input_shape, weights_shape, bias_shape, output_shape, info, data_type)
+DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(framework::dataset::concat(datasets::SmallConvolutionLayerDataset(), datasets::LargeConvolutionLayerDataset()),
+                                                                           CNNDataTypes),
+                                                                   framework::dataset::make("ActivationInfo",
+{ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU) })),
+input_shape, weights_shape, bias_shape, output_shape, info, dilation, data_type, act_info)
 {
     // Set fixed point position data type allowed
     int fixed_point_position = is_data_type_fixed_point(data_type) ? 3 : 0;
@@ -147,7 +172,7 @@
 
     // Create and configure function
     NEGEMMConvolutionLayer conv;
-    conv.configure(&src, &weights, &bias, &dst, info);
+    conv.configure(&src, &weights, &bias, &dst, info, WeightsInfo(), dilation, act_info);
 
     // Validate valid region
     const ValidRegion src_valid_region     = shape_to_valid_region(input_shape);
@@ -171,16 +196,20 @@
 TEST_SUITE(Float)
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMConvolutionLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallConvolutionLayerDataset(),
-                                                                                                                 framework::dataset::make("ReshapeWeights", { true, false })),
-                                                                                                                 framework::dataset::make("DataType", DataType::F16)))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMConvolutionLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallConvolutionLayerDataset(),
+                                                                                                                 framework::dataset::make("ReshapeWeights", { true })),
+                                                                                                                 framework::dataset::make("DataType", DataType::F16)),
+                                                                                                                 framework::dataset::make("DataLayout", { DataLayout::NCHW })),
+                                                                                                                 ActivationFunctionsDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f16);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMConvolutionLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeConvolutionLayerDataset(),
-                                                                                                                       framework::dataset::make("ReshapeWeights", { true, false })),
-                                                                                                               framework::dataset::make("DataType", DataType::F16)))
+FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMConvolutionLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeConvolutionLayerDataset(),
+                                                                                                                       framework::dataset::make("ReshapeWeights", { true })),
+                                                                                                                       framework::dataset::make("DataType", DataType::F16)),
+                                                                                                                       framework::dataset::make("DataLayout", { DataLayout::NCHW })),
+                                                                                                               ActivationFunctionsDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f16);
@@ -189,16 +218,20 @@
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 
 TEST_SUITE(FP32)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallConvolutionLayerDataset(),
-                                                                                                                  framework::dataset::make("ReshapeWeights", { true, false })),
-                                                                                                                  framework::dataset::make("DataType", DataType::F32)))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallConvolutionLayerDataset(),
+                                                                                                                  framework::dataset::make("ReshapeWeights", { true })),
+                                                                                                                  framework::dataset::make("DataType", DataType::F32)),
+                                                                                                                  framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                                                                                                                  ActivationFunctionsDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f32);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeConvolutionLayerDataset(),
-                                                                                                                        framework::dataset::make("ReshapeWeights", { true, false })),
-                                                                                                                framework::dataset::make("DataType", DataType::F32)))
+FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeConvolutionLayerDataset(),
+                                                                                                                        framework::dataset::make("ReshapeWeights", { true })),
+                                                                                                                        framework::dataset::make("DataType", DataType::F32)),
+                                                                                                                        framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                                                                                                                ActivationFunctionsDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f32);
@@ -212,18 +245,20 @@
 TEST_SUITE(FixedPoint)
 TEST_SUITE(QS8)
 // We test for fixed point precision [4,6]
-FIXTURE_DATA_TEST_CASE(RunTiny, NEGEMMConvolutionLayerFixedPointFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::TinyConvolutionLayerDataset(),
-                       framework::dataset::make("ReshapeWeights", { true, false })),
+FIXTURE_DATA_TEST_CASE(RunTiny, NEGEMMConvolutionLayerFixedPointFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::TinyConvolutionLayerDataset(),
+                       framework::dataset::make("ReshapeWeights", { true })),
                        framework::dataset::make("DataType", DataType::QS8)),
-                       framework::dataset::make("FractionalBits", 4, 7)))
+                       framework::dataset::make("FractionalBits", 4, 7)),
+                       ActivationFunctionsDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_q);
 }
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMConvolutionLayerFixedPointFixture<int8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SmallConvolutionLayerDataset(),
-                       framework::dataset::make("ReshapeWeights", { true, false })),
+FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMConvolutionLayerFixedPointFixture<int8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::SmallConvolutionLayerDataset(),
+                       framework::dataset::make("ReshapeWeights", { true })),
                        framework::dataset::make("DataType", DataType::QS8)),
-                       framework::dataset::make("FractionalBits", 4, 7)))
+                       framework::dataset::make("FractionalBits", 4, 7)),
+                       ActivationFunctionsDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_q);
@@ -232,18 +267,20 @@
 
 TEST_SUITE(QS16)
 // Testing for fixed point position [1,14)
-FIXTURE_DATA_TEST_CASE(RunTiny, NEGEMMConvolutionLayerFixedPointFixture<int16_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::TinyConvolutionLayerDataset(),
-                       framework::dataset::make("ReshapeWeights", { true, false })),
+FIXTURE_DATA_TEST_CASE(RunTiny, NEGEMMConvolutionLayerFixedPointFixture<int16_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::TinyConvolutionLayerDataset(),
+                       framework::dataset::make("ReshapeWeights", { true })),
                        framework::dataset::make("DataType", DataType::QS16)),
-                       framework::dataset::make("FractionalBits", 1, 14)))
+                       framework::dataset::make("FractionalBits", 1, 14)),
+                       ActivationFunctionsDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_q);
 }
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMConvolutionLayerFixedPointFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SmallConvolutionLayerDataset(),
-                       framework::dataset::make("ReshapeWeights", { true, false })),
+FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMConvolutionLayerFixedPointFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::SmallConvolutionLayerDataset(),
+                       framework::dataset::make("ReshapeWeights", { true })),
                        framework::dataset::make("DataType", DataType::QS16)),
-                       framework::dataset::make("FractionalBits", 1, 14)))
+                       framework::dataset::make("FractionalBits", 1, 14)),
+                       ActivationFunctionsDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_q);
@@ -254,20 +291,28 @@
 template <typename T>
 using NEGEMMConvolutionLayerQuantizedFixture = ConvolutionValidationQuantizedFixture<Tensor, Accessor, NEGEMMConvolutionLayer, T>;
 
+const auto QuantizedActivationFunctionsDataset = framework::dataset::make("ActivationInfo",
+{
+    ActivationLayerInfo(),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.f)
+});
 TEST_SUITE(Quantized)
 TEST_SUITE(QASYMM8)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallConvolutionLayerDataset(),
+FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallConvolutionLayerDataset(),
                        framework::dataset::make("ReshapeWeights", { true })),
                        framework::dataset::make("DataType", DataType::QASYMM8)),
-                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255.f, 10) })))
+                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255.f, 10) })),
+                       QuantizedActivationFunctionsDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeConvolutionLayerDataset(),
+FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeConvolutionLayerDataset(),
                        framework::dataset::make("ReshapeWeights", { true })),
                        framework::dataset::make("DataType", DataType::QASYMM8)),
-                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255.f, 10) })))
+                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255.f, 10) })),
+                       QuantizedActivationFunctionsDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
diff --git a/tests/validation/NEON/DeconvolutionLayer.cpp b/tests/validation/NEON/DeconvolutionLayer.cpp
index 566b75a..3bb6d6f 100644
--- a/tests/validation/NEON/DeconvolutionLayer.cpp
+++ b/tests/validation/NEON/DeconvolutionLayer.cpp
@@ -58,6 +58,106 @@
 TEST_SUITE(NEON)
 TEST_SUITE(DeconvolutionLayer)
 
+DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, (combine(datasets::SmallDeconvolutionShapes(), framework::dataset::make("DataType", DataType::F32))),
+               input_shape, data_type)
+{
+    // Create shapes
+    const unsigned int kernel_size_x = 3;
+    const unsigned int kernel_size_y = 3;
+    const unsigned int num_kernels   = 1;
+    const TensorShape  weights_shape(kernel_size_x, kernel_size_y, input_shape.z(), num_kernels);
+    const TensorShape  bias_shape(num_kernels);
+    auto               out_dim      = deconvolution_output_dimensions(input_shape.x(), input_shape.y(), kernel_size_x, kernel_size_y, 1, 1, 0, 0, 1, 1);
+    TensorShape        output_shape = deconvolution_output_shape(out_dim, input_shape, weights_shape);
+
+    // Create tensors
+    Tensor src     = create_tensor<Tensor>(input_shape, data_type, 1);
+    Tensor weights = create_tensor<Tensor>(weights_shape, data_type, 1);
+    Tensor bias    = create_tensor<Tensor>(bias_shape, data_type, 1);
+    Tensor dst     = create_tensor<Tensor>(output_shape, data_type, 1);
+
+    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(weights.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(bias.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+    // Create and configure function
+    NEDeconvolutionLayer deconv;
+    deconv.configure(&src, &weights, &bias, &dst, PadStrideInfo(1, 1, 1, 1, DimensionRoundingType::CEIL), 0, 0);
+
+    // Validate valid region
+    const ValidRegion src_valid_region     = shape_to_valid_region(input_shape);
+    const ValidRegion weights_valid_region = shape_to_valid_region(weights_shape);
+    const ValidRegion bias_valid_region    = shape_to_valid_region(bias_shape);
+    const ValidRegion dst_valid_region     = shape_to_valid_region(output_shape);
+
+    validate(src.info()->valid_region(), src_valid_region);
+    validate(weights.info()->valid_region(), weights_valid_region);
+    validate(bias.info()->valid_region(), bias_valid_region);
+    validate(dst.info()->valid_region(), dst_valid_region);
+}
+
+// *INDENT-OFF*
+// clang-format off
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zip(
+    framework::dataset::make("InputInfo", { TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32, 0),   // Mismatching data type
+                                            TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32, 0),   // Invalid weights shape
+                                            TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::QS8, 4),   // Non supported data type
+                                            TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32, 11),  // Invalid bias shape
+                                            TensorInfo(TensorShape(13U, 11U, 4U, 3U), 1, DataType::F32, 0), // Window shrink
+                                            TensorInfo(TensorShape(32U, 16U, 2U), 1, DataType::F32, 0),
+                                          }),
+    framework::dataset::make("WeightsInfo", { TensorInfo(TensorShape(3U, 3U, 2U, 2U), 1, DataType::F16, 0),
+                                            TensorInfo(TensorShape(3U, 3U, 2U, 4U), 1, DataType::F32, 0),
+                                            TensorInfo(TensorShape(3U, 3U, 2U, 2U), 1, DataType::QS8, 5),
+                                            TensorInfo(TensorShape(3U, 2U, 2U, 2U), 1, DataType::F32, 11),
+                                            TensorInfo(TensorShape(3U, 3U, 4U), 1, DataType::F32, 0),
+                                              TensorInfo(TensorShape(1U, 1U, 2U, 4U), 1, DataType::F32, 0),
+                                          })),
+    framework::dataset::make("BiasInfo",  { TensorInfo(TensorShape(1U), 1, DataType::F16, 0),
+                                            TensorInfo(TensorShape(1U), 1, DataType::F32, 0),
+                                            TensorInfo(TensorShape(1U), 1, DataType::F32, 5),
+                                            TensorInfo(TensorShape(25U, 11U), 1, DataType::F32, 11),
+                                            TensorInfo(TensorShape(1U), 1, DataType::F32, 0),
+                                            TensorInfo(TensorShape(4U), 1, DataType::F32, 0),
+                                          })),
+    framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F16, 0),
+                                            TensorInfo(TensorShape(25U, 10U, 2U), 1, DataType::F32, 0),
+                                            TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F32, 5),
+                                            TensorInfo(TensorShape(13U, 13U, 2U), 1, DataType::F32, 0),
+                                            TensorInfo(TensorShape(11U, 9U, 1U, 3U), 1, DataType::F32, 0),
+                                            TensorInfo(TensorShape(32U, 16U, 4U), 1, DataType::F32, 0),
+                                          })),
+    framework::dataset::make("PadStrideInfo", { PadStrideInfo(1, 1, 0, 0),
+                                                PadStrideInfo(1, 1, 0, 0),
+                                                PadStrideInfo(1, 1, 0, 0),
+                                                PadStrideInfo(1, 1, 0, 0),
+                                                PadStrideInfo(1, 1, 1, 1),
+                                                PadStrideInfo(1, 1, 0, 0),
+                                           })),
+    framework::dataset::make("ax",          {   1U,
+                                                1U,
+                                                1U,
+                                                1U,
+                                                0U,
+                                                0U,
+                                            })),
+   framework::dataset::make("ay",           {   1U,
+                                                1U,
+                                                1U,
+                                                1U,
+                                                0U,
+                                                0U,
+                                            })),
+    framework::dataset::make("Expected", { false, false, false, false, false, true })),
+    input_info, weights_info, bias_info, output_info, pad_info, ax, ay, expected)
+{
+    bool is_valid = bool(NEDeconvolutionLayer::validate(&input_info.clone()->set_is_resizable(false), &weights_info.clone()->set_is_resizable(false), &bias_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), pad_info, ax, ay));
+    ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS);
+}
+// clang-format on
+// *INDENT-ON*
+
 template <typename T>
 using NEDeconvolutionLayerFixture4x4 = DeconvolutionValidationFixture<Tensor, Accessor, NEDeconvolutionLayer, T, 4, 4>;
 
diff --git a/tests/validation/NEON/DepthwiseConvolutionLayer.cpp b/tests/validation/NEON/DepthwiseConvolutionLayer.cpp
index 0cdd4c0..b1cc491 100644
--- a/tests/validation/NEON/DepthwiseConvolutionLayer.cpp
+++ b/tests/validation/NEON/DepthwiseConvolutionLayer.cpp
@@ -22,6 +22,7 @@
  * SOFTWARE.
  */
 #include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h"
 #include "arm_compute/runtime/Tensor.h"
 #include "arm_compute/runtime/TensorAllocator.h"
@@ -40,20 +41,34 @@
 {
 namespace validation
 {
+using namespace arm_compute::misc::shape_calculator;
+
 namespace
 {
 constexpr RelativeTolerance<float>   tolerance_f32(0.01f); /**< Tolerance value for comparing reference's output against implementation's output for DataType::F32 */
 constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(1); /**< Tolerance value for comparing reference's output against implementation's output for DataType::QASYMM8 */
+
+const auto depth_multipliers = framework::dataset::make("DepthMultiplier", { 1, 2, 3 });
 } // namespace
 
 TEST_SUITE(NEON)
 TEST_SUITE(DepthwiseConvLayer)
 
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(framework::dataset::concat(datasets::SmallDepthwiseConvolutionLayerDataset3x3(),
-                                                                                              datasets::LargeDepthwiseConvolutionLayerDataset3x3()),
+DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(framework::dataset::concat(datasets::SmallDepthwiseConvolutionLayerDataset3x3(),
+                                                                                                      datasets::LargeDepthwiseConvolutionLayerDataset3x3()),
+                                                                           depth_multipliers),
                                                                    framework::dataset::make("DataType", DataType::F32)),
-               input_shape, weights_shape, output_shape, info, data_type)
+               input_shape, kernel_size, info, depth_multiplier, data_type)
 {
+    // Get shapes
+    TensorShape weights_shape(kernel_size.width, kernel_size.height);
+
+    const TensorInfo  in_info(input_shape, 1, data_type);
+    const TensorInfo  we_info(weights_shape, 1, data_type);
+    const TensorShape output_shape = compute_depthwise_convolution_shape(in_info, we_info, info, depth_multiplier);
+
+    weights_shape.set(2, output_shape.z());
+
     // Create tensors
     Tensor            src     = create_tensor<Tensor>(input_shape, data_type);
     Tensor            dst     = create_tensor<Tensor>(output_shape, data_type);
@@ -68,7 +83,7 @@
 
     // Create and configure function
     NEDepthwiseConvolutionLayer3x3 depthwise_layer;
-    depthwise_layer.configure(&src, &weights, &bias, &dst, info);
+    depthwise_layer.configure(&src, &weights, &bias, &dst, info, depth_multiplier);
 
     // Validate valid region
     const ValidRegion input_valid_region   = shape_to_valid_region(input_shape);
@@ -82,7 +97,7 @@
     validate(bias.info()->valid_region(), bias_valid_region);
 
     // Validate padding
-    bool              is_optimized_run = NEDepthwiseConvolutionLayer3x3Kernel::is_optimized_execution_possible(input_shape, info, data_type, DataLayout::NCHW);
+    bool              is_optimized_run = NEDepthwiseConvolutionLayer3x3Kernel::is_optimized_execution_possible(input_shape, info, data_type, depth_multiplier, DataLayout::NCHW);
     const int         step_non_opt_dwc = 16 >> info.stride().first;
     const int         step_bias_add    = 16 / src.info()->element_size();
     const int         step             = is_optimized_run ? step_bias_add : std::max(step_non_opt_dwc, step_bias_add);
@@ -95,15 +110,19 @@
 TEST_SUITE(Generic)
 template <typename T>
 using NEDepthwiseConvolutionLayerFixture = DepthwiseConvolutionLayerValidationFixture<Tensor, Accessor, NEDepthwiseConvolutionLayer, T>;
-FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallDepthwiseConvolutionLayerDataset(),
+FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset(),
+                                                                                                                       depth_multipliers),
                                                                                                                        framework::dataset::make("DataType",
-                                                                                                                               DataType::F32)))
+                                                                                                                               DataType::F32)),
+                                                                                                                       framework::dataset::make("DataLayout", DataLayout::NCHW)))
 {
     validate(Accessor(_target), _reference, tolerance_f32);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeDepthwiseConvolutionLayerDataset(),
+FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset(),
+                                                                                                                     depth_multipliers),
                                                                                                                      framework::dataset::make("DataType",
-                                                                                                                             DataType::F32)))
+                                                                                                                             DataType::F32)),
+                                                                                                                     framework::dataset::make("DataLayout", DataLayout::NCHW)))
 {
     validate(Accessor(_target), _reference, tolerance_f32);
 }
@@ -112,21 +131,27 @@
 TEST_SUITE(W3x3)
 template <typename T>
 using NEDepthwiseConvolutionLayerFixture3x3 = DepthwiseConvolutionLayerValidationFixture<Tensor, Accessor, NEDepthwiseConvolutionLayer3x3, T>;
-FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerFixture3x3<float>, framework::DatasetMode::ALL, combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(),
+FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerFixture3x3<float>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(),
+                                                                                                                    depth_multipliers),
                                                                                                                     framework::dataset::make("DataType",
-                                                                                                                            DataType::F32)))
+                                                                                                                            DataType::F32)),
+                                                                                                                    framework::dataset::make("DataLayout", DataLayout::NCHW)))
 {
     validate(Accessor(_target), _reference, tolerance_f32);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthwiseConvolutionLayerFixture3x3<float>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(),
+FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthwiseConvolutionLayerFixture3x3<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(),
+                                                                                                                        depth_multipliers),
                                                                                                                         framework::dataset::make("DataType",
-                                                                                                                                DataType::F32)))
+                                                                                                                                DataType::F32)),
+                                                                                                                        framework::dataset::make("DataLayout", DataLayout::NCHW)))
 {
     validate(Accessor(_target), _reference, tolerance_f32);
 }
-FIXTURE_DATA_TEST_CASE(RunOptimized, NEDepthwiseConvolutionLayerFixture3x3<float>, framework::DatasetMode::ALL, combine(datasets::OptimizedDepthwiseConvolutionLayerDataset3x3(),
+FIXTURE_DATA_TEST_CASE(RunOptimized, NEDepthwiseConvolutionLayerFixture3x3<float>, framework::DatasetMode::ALL, combine(combine(combine(datasets::OptimizedDepthwiseConvolutionLayerDataset3x3(),
+                                                                                                                        framework::dataset::make("DepthMultiplier", 1)),
                                                                                                                         framework::dataset::make("DataType",
-                                                                                                                                DataType::F32)))
+                                                                                                                                DataType::F32)),
+                                                                                                                        framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 {
     validate(Accessor(_target), _reference, tolerance_f32);
 }
@@ -143,23 +168,31 @@
 TEST_SUITE(Quantized)
 TEST_SUITE(QASYMM8)
 TEST_SUITE(Generic)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset(),
-                       framework::dataset::make("DataType", DataType::QASYMM8)),
-                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, 10) })))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset(),
+                                                       depth_multipliers),
+                                               framework::dataset::make("DataType", DataType::QASYMM8)),
+                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, 10) })),
+                               framework::dataset::make("DataLayout", DataLayout::NCHW)))
 {
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
 TEST_SUITE_END()
 TEST_SUITE(W3x3)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerQuantizedFixture3x3<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(),
-                       framework::dataset::make("DataType", DataType::QASYMM8)),
-                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, 10) })))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerQuantizedFixture3x3<uint8_t>, framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(), depth_multipliers),
+                                               framework::dataset::make("DataType", DataType::QASYMM8)),
+                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, 10) })),
+                               framework::dataset::make("DataLayout", DataLayout::NCHW)))
 {
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthwiseConvolutionLayerQuantizedFixture3x3<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(),
-                       framework::dataset::make("DataType", DataType::QASYMM8)),
-                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, 10) })))
+FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthwiseConvolutionLayerQuantizedFixture3x3<uint8_t>, framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(),
+                                                       depth_multipliers),
+                                               framework::dataset::make("DataType", DataType::QASYMM8)),
+                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, 10) })),
+                               framework::dataset::make("DataLayout", DataLayout::NCHW)))
 {
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
diff --git a/tests/validation/NEON/DilatedConvolutionLayer.cpp b/tests/validation/NEON/DilatedConvolutionLayer.cpp
new file mode 100644
index 0000000..c65e993
--- /dev/null
+++ b/tests/validation/NEON/DilatedConvolutionLayer.cpp
@@ -0,0 +1,285 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+#include "tests/NEON/Accessor.h"
+#include "tests/PaddingCalculator.h"
+#include "tests/datasets/DilatedConvolutionLayerDataset.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/fixtures/ConvolutionLayerFixture.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace
+{
+const AbsoluteTolerance<float> tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for DataType::F32 */
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+const AbsoluteTolerance<float> tolerance_f16(0.01f);       /**< Tolerance value for comparing reference's output against implementation's output for DataType::F16 */
+#endif                                                     /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+const AbsoluteTolerance<float>     tolerance_q(1.0f);      /**< Tolerance value for comparing reference's output against implementation's output for fixed point data types */
+constexpr AbsoluteTolerance<float> tolerance_qasymm8(0.0); /**< Tolerance value for comparing reference's output against implementation's output for quantized data types */
+
+/** CNN data types */
+const auto CNNDataTypes = framework::dataset::make("DataType",
+{
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+    DataType::F16,
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+    DataType::F32,
+    DataType::QS8,
+    DataType::QS16,
+    DataType::QASYMM8,
+});
+} // namespace
+
+TEST_SUITE(NEON)
+
+TEST_SUITE(DilatedConvolutionLayer)
+DATA_TEST_CASE(ValidateConvolutionMethod, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(
+                                                                                           framework::dataset::make("InputInfo", { TensorInfo(TensorShape(8U, 8U, 2U), 1, DataType::F32, 0),
+                                                                                                                    TensorInfo(TensorShape(23U, 27U, 5U, 4U), 1, DataType::F32, 0),
+                                                                                                                    TensorInfo(TensorShape(3U, 3U, 2U, 1U), 1, DataType::F32, 0),
+                                                                                                                    TensorInfo(TensorShape(33U, 27U, 7U, 4U), 1, DataType::F32, 0)
+                                                                                                                                 }),
+                                                                                           framework::dataset::make("WeightsInfo", { TensorInfo(TensorShape(3U, 3U, 5U, 21U), 1, DataType::F32, 0),
+                                                                                                                    TensorInfo(TensorShape(3U, 3U, 5U, 21U), 1, DataType::F32, 0),
+                                                                                                                    TensorInfo(TensorShape(3U, 3U, 5U, 21U), 1, DataType::F32, 0),
+                                                                                                                    TensorInfo(TensorShape(5U, 5U, 7U, 16U), 1, DataType::F16, 0)
+                                                                                                                                   })),
+                                                                                       framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(6U, 6U, 1U), 1, DataType::F32, 0),
+                                                                                                                TensorInfo(TensorShape(21U, 25U, 21U, 4U), 1, DataType::F32, 0),
+                                                                                                                TensorInfo(TensorShape(11U, 25U, 21U), 1, DataType::F32, 0),
+                                                                                                                TensorInfo(TensorShape(11U, 12U, 16U, 4U), 1, DataType::F32, 0)
+                                                                                                                              })),
+                                                                                   framework::dataset::make("ConvInfo", { PadStrideInfo(1, 1, 0, 0),
+                                                                                                            PadStrideInfo(1, 1, 0, 0),
+                                                                                                            PadStrideInfo(2, 1, 0, 0),
+                                                                                                            PadStrideInfo(3, 2, 1, 0)
+                                                                                                                        })),
+                                                                               framework::dataset::make("Dilation", { Size2D(1U, 2U),
+                                                                                                                      Size2D(2U, 1U),
+                                                                                                                      Size2D(2U, 2U),
+                                                                                                                      Size2D(3U, 3U)
+                                                                                                                    })),
+                                                                           framework::dataset::make("Expected", { ConvolutionMethod::GEMM, ConvolutionMethod::GEMM, ConvolutionMethod::GEMM, ConvolutionMethod::GEMM })),
+               input_info, weights_info, output_info, conv_info, dilation, expected)
+{
+    ConvolutionMethod is_valid = NEConvolutionLayer::get_convolution_method(&input_info.clone()->set_is_resizable(false),
+                                                                            &weights_info.clone()->set_is_resizable(false),
+                                                                            &output_info.clone()->set_is_resizable(false),
+                                                                            conv_info, WeightsInfo(), dilation);
+    ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS);
+}
+TEST_SUITE_END()
+
+TEST_SUITE(GEMMDilatedConvolutionLayer)
+
+DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(framework::dataset::concat(datasets::SmallDilatedConvolutionLayerDataset(), datasets::LargeDilatedConvolutionLayerDataset()),
+                                                                   CNNDataTypes),
+               input_shape, weights_shape, bias_shape, output_shape, info, dilation, data_type)
+{
+    // Set fixed point position data type allowed
+    int fixed_point_position = is_data_type_fixed_point(data_type) ? 3 : 0;
+
+    auto bias_data_type = is_data_type_quantized_asymmetric(data_type) ? DataType::S32 : data_type;
+
+    // Create tensors
+    Tensor src     = create_tensor<Tensor>(input_shape, data_type, 1, fixed_point_position, QuantizationInfo(2.f / 255.f, 127));
+    Tensor weights = create_tensor<Tensor>(weights_shape, data_type, 1, fixed_point_position, QuantizationInfo(2.f / 255.f, 127));
+    Tensor bias    = create_tensor<Tensor>(bias_shape, bias_data_type, 1, fixed_point_position, QuantizationInfo(2.f / 255.f, 127));
+    Tensor dst     = create_tensor<Tensor>(output_shape, data_type, 1, fixed_point_position, QuantizationInfo(2.f / 255.f, 127));
+
+    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(weights.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(bias.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+    const QuantizationInfo src_quantization_info     = src.info()->quantization_info();
+    const QuantizationInfo weights_quantization_info = weights.info()->quantization_info();
+
+    // Create and configure function
+    NEGEMMConvolutionLayer conv;
+    conv.configure(&src, &weights, &bias, &dst, info, WeightsInfo(), dilation);
+
+    // Validate valid region
+    const ValidRegion src_valid_region     = shape_to_valid_region(input_shape);
+    const ValidRegion weights_valid_region = shape_to_valid_region(weights_shape);
+    const ValidRegion bias_valid_region    = shape_to_valid_region(bias_shape);
+    const ValidRegion dst_valid_region     = shape_to_valid_region(output_shape);
+
+    validate(src.info()->valid_region(), src_valid_region);
+    validate(weights.info()->valid_region(), weights_valid_region);
+    validate(bias.info()->valid_region(), bias_valid_region);
+    validate(dst.info()->valid_region(), dst_valid_region);
+
+    // Validate QuantizationInfo
+    ARM_COMPUTE_EXPECT(src.info()->quantization_info() == src_quantization_info, framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(weights.info()->quantization_info() == weights_quantization_info, framework::LogLevel::ERRORS);
+}
+
+template <typename T>
+using NEGEMMDilatedConvolutionLayerFixture = ConvolutionValidationFixture<Tensor, Accessor, NEConvolutionLayer, T>;
+
+TEST_SUITE(Float)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+TEST_SUITE(FP16)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMDilatedConvolutionLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallDilatedConvolutionLayerDataset(),
+                                                                                                                        framework::dataset::make("ReshapeWeights", { true })),
+                                                                                                                        framework::dataset::make("DataType", DataType::F16)),
+                                                                                                                        framework::dataset::make("DataLayout", { DataLayout::NCHW })),
+                                                                                                                        framework::dataset::make("ActivationLayerInfo", ActivationLayerInfo())))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_f16);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMDilatedConvolutionLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeDilatedConvolutionLayerDataset(),
+                                                                                                                      framework::dataset::make("ReshapeWeights", { true })),
+                                                                                                                      framework::dataset::make("DataType", DataType::F16)),
+                                                                                                                      framework::dataset::make("DataLayout", { DataLayout::NCHW })),
+                                                                                                                      framework::dataset::make("ActivationLayerInfo", ActivationLayerInfo())))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_f16);
+}
+TEST_SUITE_END()
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+
+TEST_SUITE(FP32)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMDilatedConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallDilatedConvolutionLayerDataset(),
+                       framework::dataset::make("ReshapeWeights", { true })),
+                       framework::dataset::make("DataType", DataType::F32)),
+                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                       framework::dataset::make("ActivationLayerInfo", ActivationLayerInfo())))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_f32);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMDilatedConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeDilatedConvolutionLayerDataset(),
+                                                                                                                       framework::dataset::make("ReshapeWeights", { true })),
+                                                                                                                       framework::dataset::make("DataType", DataType::F32)),
+                                                                                                                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                                                                                                                       framework::dataset::make("ActivationLayerInfo", ActivationLayerInfo())))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_f32);
+}
+TEST_SUITE_END()
+TEST_SUITE_END()
+
+template <typename T>
+using NEGEMMDilatedConvolutionLayerFixedPointFixture = ConvolutionValidationFixedPointFixture<Tensor, Accessor, NEGEMMConvolutionLayer, T>;
+
+TEST_SUITE(FixedPoint)
+TEST_SUITE(QS8)
+// We test for fixed point precision [4,6]
+FIXTURE_DATA_TEST_CASE(RunTiny, NEGEMMDilatedConvolutionLayerFixedPointFixture<int8_t>, framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(combine(datasets::TinyDilatedConvolutionLayerDataset(),
+                                                       framework::dataset::make("ReshapeWeights", { true })),
+                                               framework::dataset::make("DataType", DataType::QS8)),
+                                       framework::dataset::make("FractionalBits", 4, 7)),
+                               framework::dataset::make("ActivationLayerInfo", ActivationLayerInfo())))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_q);
+}
+FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMDilatedConvolutionLayerFixedPointFixture<int8_t>, framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(combine(datasets::SmallDilatedConvolutionLayerDataset(),
+                                                       framework::dataset::make("ReshapeWeights", { true })),
+                                               framework::dataset::make("DataType", DataType::QS8)),
+                                       framework::dataset::make("FractionalBits", 4, 7)),
+                               framework::dataset::make("ActivationLayerInfo", ActivationLayerInfo())))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_q);
+}
+TEST_SUITE_END()
+
+TEST_SUITE(QS16)
+// Testing for fixed point position [1,14)
+FIXTURE_DATA_TEST_CASE(RunTiny, NEGEMMDilatedConvolutionLayerFixedPointFixture<int16_t>, framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(combine(datasets::TinyDilatedConvolutionLayerDataset(),
+                                                       framework::dataset::make("ReshapeWeights", { true })),
+                                               framework::dataset::make("DataType", DataType::QS16)),
+                                       framework::dataset::make("FractionalBits", 1, 14)),
+                               framework::dataset::make("ActivationLayerInfo", ActivationLayerInfo())))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_q);
+}
+FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMDilatedConvolutionLayerFixedPointFixture<int16_t>, framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(combine(datasets::SmallDilatedConvolutionLayerDataset(),
+                                                       framework::dataset::make("ReshapeWeights", { true })),
+                                               framework::dataset::make("DataType", DataType::QS16)),
+                                       framework::dataset::make("FractionalBits", 1, 14)),
+                               framework::dataset::make("ActivationLayerInfo", ActivationLayerInfo())))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_q);
+}
+TEST_SUITE_END()
+TEST_SUITE_END()
+
+template <typename T>
+using NEGEMMDilatedConvolutionLayerQuantizedFixture = ConvolutionValidationQuantizedFixture<Tensor, Accessor, NEGEMMConvolutionLayer, T>;
+
+TEST_SUITE(Quantized)
+TEST_SUITE(QASYMM8)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMDilatedConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(combine(datasets::SmallDilatedConvolutionLayerDataset(),
+                                                       framework::dataset::make("ReshapeWeights", { true })),
+                                               framework::dataset::make("DataType", DataType::QASYMM8)),
+                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255.f, 10) })),
+                               framework::dataset::make("ActivationLayerInfo", ActivationLayerInfo())))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMDilatedConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(combine(datasets::LargeDilatedConvolutionLayerDataset(),
+                                                       framework::dataset::make("ReshapeWeights", { true })),
+                                               framework::dataset::make("DataType", DataType::QASYMM8)),
+                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255.f, 10) })),
+                               framework::dataset::make("ActivationLayerInfo", ActivationLayerInfo())))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+TEST_SUITE_END()
+TEST_SUITE_END()
+
+TEST_SUITE_END()
+TEST_SUITE_END()
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/NEON/DirectConvolutionLayer.cpp b/tests/validation/NEON/DirectConvolutionLayer.cpp
index 77f2892..f4c7693 100644
--- a/tests/validation/NEON/DirectConvolutionLayer.cpp
+++ b/tests/validation/NEON/DirectConvolutionLayer.cpp
@@ -49,43 +49,51 @@
 constexpr AbsoluteTolerance<float> tolerance_fp32(0.001f); /**< Tolerance for floating point tests */
 
 /** Direct convolution data set. */
-const auto data_pad_f32 = concat(concat(combine(framework::dataset::make("PadX", 0, 1),
-                                                combine(framework::dataset::make("PadY", 0, 1),
-                                                        framework::dataset::make("KernelSize", 1))),
-                                        combine(framework::dataset::make("PadX", 0, 2),
-                                                combine(framework::dataset::make("PadY", 0, 2),
+const auto data_pad_f32 = concat(concat(combine(framework::dataset::make("PadX", { 0, 1 }),
+                                                combine(framework::dataset::make("PadY", { 0, 1 }),
+                                                        framework::dataset::make("KernelSize", 3))),
+                                        combine(framework::dataset::make("PadX", { 0, 2 }),
+                                                combine(framework::dataset::make("PadY", { 0, 2 }),
                                                         framework::dataset::make("KernelSize", 3)))),
-                                 combine(framework::dataset::make("PadX", 0, 3),
-                                         combine(framework::dataset::make("PadY", 0, 3),
+                                 combine(framework::dataset::make("PadX", { 0, 3 }),
+                                         combine(framework::dataset::make("PadY", { 0, 3 }),
                                                  framework::dataset::make("KernelSize", 5))));
 
 const auto data_pad_qs8 = concat(combine(framework::dataset::make("PadX", 0),
                                          combine(framework::dataset::make("PadY", 0),
                                                  framework::dataset::make("KernelSize", 1))),
-                                 combine(framework::dataset::make("PadX", 0, 2),
-                                         combine(framework::dataset::make("PadY", 0, 2),
+                                 combine(framework::dataset::make("PadX", { 0, 2 }),
+                                         combine(framework::dataset::make("PadY", { 0, 2 }),
                                                  framework::dataset::make("KernelSize", 3))));
 
 const auto data_f32 = combine(datasets::SmallDirectConvolutionShapes(),
-                              combine(framework::dataset::make("StrideX", 1, 3),
-                                      combine(framework::dataset::make("StrideY", 1, 3),
+                              combine(framework::dataset::make("StrideX", { 1, 3 }),
+                                      combine(framework::dataset::make("StrideY", { 1, 3 }),
                                               combine(data_pad_f32,
                                                       framework::dataset::make("NumKernels", { 1, 4, 8, 16 })))));
 
 const auto data_qs8 = combine(datasets::TinyDirectConvolutionShapes(),
-                              combine(framework::dataset::make("StrideX", 1, 3),
-                                      combine(framework::dataset::make("StrideY", 1, 3),
+                              combine(framework::dataset::make("StrideX", { 1, 3 }),
+                                      combine(framework::dataset::make("StrideY", { 1, 3 }),
                                               combine(data_pad_qs8,
                                                       framework::dataset::make("NumKernels", { 1, 4, 8, 16 })))));
 
 /** Direct convolution QS16 data set. */
 const auto data_qs16 = combine(datasets::TinyDirectConvolutionShapes(),
-                               combine(framework::dataset::make("StrideX", 1, 3),
-                                       combine(framework::dataset::make("StrideY", 1, 3),
+                               combine(framework::dataset::make("StrideX", { 1, 3 }),
+                                       combine(framework::dataset::make("StrideY", { 1, 3 }),
                                                combine(framework::dataset::make("PadX", 0),
                                                        combine(framework::dataset::make("PadY", 0),
                                                                combine(framework::dataset::make("KernelSize", 1),
                                                                        framework::dataset::make("NumKernels", { 1, 4, 8, 16 })))))));
+/** Activation function Dataset*/
+const auto ActivationFunctionsDataset = framework::dataset::make("ActivationInfo",
+{
+    ActivationLayerInfo(),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 0.5f),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 0.5f)
+});
 } // namespace
 
 TEST_SUITE(NEON)
@@ -93,7 +101,7 @@
 
 // *INDENT-OFF*
 // clang-format off
-DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(
         framework::dataset::make("InputInfo", { TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32, 0), // Mismatching data type input/weights
                                                 TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32, 0), // Mismatching input feature maps
                                                 TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32, 0), // Unsupported kernel width
@@ -144,10 +152,15 @@
                                                 PadStrideInfo(1, 1, 0, 0),
                                                 PadStrideInfo(1, 1, 0, 0),
                                                })),
-        framework::dataset::make("Expected", { false, false, false, false, false, false, false, false, false })),
-        input_info, weights_info, biases_info, output_info, conv_info, expected)
+                                                       framework::dataset::make("ActivationInfo",
 {
-        bool is_valid = bool(NEDirectConvolutionLayer::validate(&input_info.clone()->set_is_resizable(false), &weights_info.clone()->set_is_resizable(false), &biases_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), conv_info));
+    ActivationLayerInfo(),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)
+})),
+        framework::dataset::make("Expected", { false, false, false, false, false, false, false, false, false })),
+        input_info, weights_info, biases_info, output_info, conv_info, act_info, expected)
+{
+        bool is_valid = bool(NEDirectConvolutionLayer::validate(&input_info.clone()->set_is_resizable(false), &weights_info.clone()->set_is_resizable(false), &biases_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), conv_info, act_info));
         ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS);
 }
 // clang-format on
@@ -159,7 +172,9 @@
 TEST_SUITE(Float)
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(Run, NEDirectConvolutionLayerFixture<half>, framework::DatasetMode::ALL, combine(data_f32, framework::dataset::make("DataType", DataType::F16)))
+FIXTURE_DATA_TEST_CASE(Run, NEDirectConvolutionLayerFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(data_f32, framework::dataset::make("DataType", DataType::F16)),
+                                                                                                                ActivationFunctionsDataset),
+                                                                                                        framework::dataset::make("DataLayout", DataLayout::NCHW)))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_fp16);
@@ -168,7 +183,9 @@
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 
 TEST_SUITE(FP32)
-FIXTURE_DATA_TEST_CASE(Run, NEDirectConvolutionLayerFixture<float>, framework::DatasetMode::ALL, combine(data_f32, framework::dataset::make("DataType", DataType::F32)))
+FIXTURE_DATA_TEST_CASE(Run, NEDirectConvolutionLayerFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(data_f32, framework::dataset::make("DataType", DataType::F32)),
+                                                                                                                 ActivationFunctionsDataset),
+                                                                                                         framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_fp32);
@@ -179,11 +196,19 @@
 template <typename T>
 using NEDirectConvolutionLayerFixedPointFixture = DirectConvolutionValidationFixedPointFixture<Tensor, Accessor, NEDirectConvolutionLayer, T>;
 
+const auto QuantizedActivationFunctionsDataset = framework::dataset::make("ActivationInfo",
+{
+    ActivationLayerInfo(),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.f)
+});
+
 TEST_SUITE(Quantized)
 TEST_SUITE(QS8)
 // We test for fixed point precision [4,6]
-FIXTURE_DATA_TEST_CASE(Run, NEDirectConvolutionLayerFixedPointFixture<int8_t>, framework::DatasetMode::ALL, combine(combine(data_qs8, framework::dataset::make("DataType", DataType::QS8)),
-                                                                                                                    framework::dataset::make("FractionalBits", 4, 7)))
+FIXTURE_DATA_TEST_CASE(Run, NEDirectConvolutionLayerFixedPointFixture<int8_t>, framework::DatasetMode::ALL, combine(combine(combine(data_qs8, framework::dataset::make("DataType", DataType::QS8)),
+                                                                                                                    framework::dataset::make("FractionalBits", 4, 7)),
+                                                                                                                    QuantizedActivationFunctionsDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qs);
@@ -192,8 +217,9 @@
 
 TEST_SUITE(QS16)
 // We test for fixed point precision [4,13]
-FIXTURE_DATA_TEST_CASE(Run, NEDirectConvolutionLayerFixedPointFixture<int16_t>, framework::DatasetMode::ALL, combine(combine(data_qs16, framework::dataset::make("DataType", DataType::QS16)),
-                                                                                                                     framework::dataset::make("FractionalBits", 4, 14)))
+FIXTURE_DATA_TEST_CASE(Run, NEDirectConvolutionLayerFixedPointFixture<int16_t>, framework::DatasetMode::ALL, combine(combine(combine(data_qs16, framework::dataset::make("DataType", DataType::QS16)),
+                                                                                                                     framework::dataset::make("FractionalBits", 4, 14)),
+                                                                                                                     QuantizedActivationFunctionsDataset))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qs);
diff --git a/tests/validation/NEON/GEMMLowp.cpp b/tests/validation/NEON/GEMMLowp.cpp
index a901b44..ed24d61 100644
--- a/tests/validation/NEON/GEMMLowp.cpp
+++ b/tests/validation/NEON/GEMMLowp.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,6 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEGEMMInterleaveBlockedKernel.h"
-#include "arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.h"
 #include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"
@@ -39,7 +37,6 @@
 #include "tests/framework/Macros.h"
 #include "tests/framework/datasets/Datasets.h"
 #include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/GEMMInterleaveBlockedFixture.h"
 #include "tests/validation/fixtures/GEMMLowpAssemblyFixture.h"
 #include "tests/validation/fixtures/GEMMLowpFixture.h"
 
@@ -51,8 +48,6 @@
 {
 namespace
 {
-const auto data_int_blk         = framework::dataset::make("M", 8, 12) * framework::dataset::make("N", 8, 12) * framework::dataset::make("by", 8, 13) * framework::dataset::make("block", 4, 9);
-const auto data_int_blk_tr      = framework::dataset::make("M", 8, 17) * framework::dataset::make("N", 8, 14) * framework::dataset::make("by", 12) * framework::dataset::make("block", 4);
 const auto data_matrix_multiply = framework::dataset::make("M", 12, 20) * framework::dataset::make("N", 12, 20) * framework::dataset::make("K", 16);
 } // namespace
 
@@ -80,29 +75,6 @@
 TEST_SUITE_END()
 
 TEST_SUITE(GEMMLowp)
-
-TEST_SUITE(INTERLEAVE_BLOCKED)
-
-using NEInterleaveBlocked            = NESynthetizeFunction<NEGEMMInterleaveBlockedKernel>;
-using NEGEMMInterleaveBlockedFixture = GEMMInterleaveBlockedValidationFixture<Tensor, Accessor, NEInterleaveBlocked>;
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMInterleaveBlockedFixture, framework::DatasetMode::PRECOMMIT, data_int_blk)
-{
-    // Validate output
-    validate(Accessor(_target), _reference);
-}
-TEST_SUITE_END()
-
-TEST_SUITE(INTERLEAVE_BLOCKED_TRANSPOSED)
-using NEInterleaveBlockedTransposed            = NESynthetizeFunction<NEGEMMInterleaveBlockedKernel>;
-using NEGEMMInterleaveBlockedTransposedFixture = GEMMInterleaveBlockedValidationFixture<Tensor, Accessor, NEInterleaveBlockedTransposed, true>;
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMInterleaveBlockedTransposedFixture, framework::DatasetMode::PRECOMMIT, data_int_blk_tr)
-{
-    // Validate output
-    validate(Accessor(_target), _reference);
-}
-
-TEST_SUITE_END()
-
 TEST_SUITE(MatrixMultiplyCore)
 using NEGEMMLowpMatrixMultiplyCoreFixture = GEMMLowpMatrixMultiplyCoreValidationFixture<Tensor, Accessor, NEGEMMLowpMatrixMultiplyCore>;
 
diff --git a/tests/validation/NEON/GaussianPyramid.cpp b/tests/validation/NEON/GaussianPyramid.cpp
index 6fee0dd..c646b50 100644
--- a/tests/validation/NEON/GaussianPyramid.cpp
+++ b/tests/validation/NEON/GaussianPyramid.cpp
@@ -44,13 +44,11 @@
 {
 namespace
 {
-constexpr AbsoluteTolerance<float> tolerance_fp32(1.0f); /**< Tolerance value for comparing reference's output against implementation's output */
-
-const auto small_gaussian_pyramid_levels = combine(datasets::Medium2DShapes(), datasets::BorderModes()) * framework::dataset::make("numlevels", 2, 3);
+const auto small_gaussian_pyramid_levels = combine(datasets::Medium2DShapes(), datasets::BorderModes()) * framework::dataset::make("numlevels", 2, 4);
 const auto large_gaussian_pyramid_levels = combine(datasets::Large2DShapes(), datasets::BorderModes()) * framework::dataset::make("numlevels", 2, 5);
 
-template <typename T, typename U>
-inline void validate_gaussian_pyramid(const Pyramid &target, const std::vector<SimpleTensor<T>> &reference, BorderMode border_mode, U tolerance, float tolerance_number = 0.0f)
+template <typename T>
+inline void validate_gaussian_pyramid(const Pyramid &target, const std::vector<SimpleTensor<T>> &reference, BorderMode border_mode)
 {
     ValidRegion prev_valid_region = shape_to_valid_region(reference[0].shape());
 
@@ -59,7 +57,7 @@
         const ValidRegion valid_region = shape_to_valid_region_gaussian_pyramid_half(reference[i - 1].shape(), prev_valid_region, (border_mode == BorderMode::UNDEFINED));
 
         // Validate outputs
-        validate(Accessor(*(target.get_pyramid_level(i))), reference[i], valid_region, tolerance, tolerance_number);
+        validate(Accessor(*(target.get_pyramid_level(i))), reference[i], valid_region);
 
         // Keep the valid region for the next level
         prev_valid_region = valid_region;
@@ -97,12 +95,12 @@
 
 FIXTURE_DATA_TEST_CASE(RunSmallGaussianPyramidHalf, NEGaussianPyramidHalfFixture<uint8_t>, framework::DatasetMode::ALL, small_gaussian_pyramid_levels)
 {
-    validate_gaussian_pyramid(_target, _reference, _border_mode, tolerance_fp32);
+    validate_gaussian_pyramid(_target, _reference, _border_mode);
 }
 
 FIXTURE_DATA_TEST_CASE(RunLargeGaussianPyramidHalf, NEGaussianPyramidHalfFixture<uint8_t>, framework::DatasetMode::NIGHTLY, large_gaussian_pyramid_levels)
 {
-    validate_gaussian_pyramid(_target, _reference, _border_mode, tolerance_fp32, 0.01f);
+    validate_gaussian_pyramid(_target, _reference, _border_mode);
 }
 TEST_SUITE_END()
 TEST_SUITE_END()
diff --git a/tests/validation/NEON/GlobalPoolingLayer.cpp b/tests/validation/NEON/GlobalPoolingLayer.cpp
index 37950b0..7697806 100644
--- a/tests/validation/NEON/GlobalPoolingLayer.cpp
+++ b/tests/validation/NEON/GlobalPoolingLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -58,7 +58,9 @@
 
 TEST_SUITE(Float)
 TEST_SUITE(FP32)
-FIXTURE_DATA_TEST_CASE(RunGlobalPooling, NEGlobalPoolingLayerFixture<float>, framework::DatasetMode::ALL, combine(GlobalPoolingLayerDataset, framework::dataset::make("DataType", DataType::F32)))
+FIXTURE_DATA_TEST_CASE(RunGlobalPooling, NEGlobalPoolingLayerFixture<float>, framework::DatasetMode::ALL, combine(combine(GlobalPoolingLayerDataset, framework::dataset::make("DataType",
+                                                                                                                  DataType::F32)),
+                                                                                                                  framework::dataset::make("DataLayout", DataLayout::NCHW)))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f32);
diff --git a/tests/validation/NEON/HOGDetector.cpp b/tests/validation/NEON/HOGDetector.cpp
new file mode 100644
index 0000000..c787728
--- /dev/null
+++ b/tests/validation/NEON/HOGDetector.cpp
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/Array.h"
+#include "arm_compute/runtime/NEON/functions/NEHOGDescriptor.h"
+#include "arm_compute/runtime/NEON/functions/NEHOGDetector.h"
+#include "tests/NEON/Accessor.h"
+#include "tests/NEON/ArrayAccessor.h"
+#include "tests/NEON/HOGAccessor.h"
+#include "tests/datasets/HOGDescriptorDataset.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/fixtures/HOGDetectorFixture.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace
+{
+/* Set the tolerance (percentage) used when validating the score of detection window.
+   Note: High tolerance is required due to divergence between CL and NEON detection window scores. */
+RelativeTolerance<float> tolerance(1.0f);
+
+/* Input dataset (values must be a multiple of the HOGInfo block_size) */
+const auto DetectionWindowStrideDataset = framework::dataset::make("DetectionWindowStride", { Size2D(8, 8), Size2D(16, 16) });
+} // namespace
+
+TEST_SUITE(NEON)
+TEST_SUITE(HOGDetector)
+
+// *INDENT-OFF*
+// clang-format off
+using NEHOGDetectorFixture = HOGDetectorValidationFixture<Tensor,
+                                                          HOG,
+                                                          DetectionWindowArray,
+                                                          NEHOGDescriptor,
+                                                          Accessor,
+                                                          ArrayAccessor<DetectionWindow>,
+                                                          HOGAccessor,
+                                                          NEHOGDetector,
+                                                          uint8_t,
+                                                          float>;
+
+FIXTURE_DATA_TEST_CASE(RunSmall, NEHOGDetectorFixture, framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(
+                       DetectionWindowStrideDataset,
+                       datasets::SmallHOGDescriptorDataset()),
+                       framework::dataset::make("Format", Format::U8)),
+                       framework::dataset::make("BorderMode", {BorderMode::CONSTANT, BorderMode::REPLICATE})))
+{
+    // Validate output
+    validate_detection_windows(_target.begin(), _target.end(), _reference.begin(), _reference.end(), tolerance);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, NEHOGDetectorFixture, framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(
+                       DetectionWindowStrideDataset,
+                       datasets::LargeHOGDescriptorDataset()),
+                       framework::dataset::make("Format", Format::U8)),
+                       framework::dataset::make("BorderMode", {BorderMode::CONSTANT, BorderMode::REPLICATE})))
+{
+    // Validate output
+    validate_detection_windows(_target.begin(), _target.end(), _reference.begin(), _reference.end(), tolerance);
+}
+
+// clang-format on
+// *INDENT-ON*
+
+TEST_SUITE_END()
+TEST_SUITE_END()
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/NEON/HOGMultiDetection.cpp b/tests/validation/NEON/HOGMultiDetection.cpp
new file mode 100644
index 0000000..d6017e0
--- /dev/null
+++ b/tests/validation/NEON/HOGMultiDetection.cpp
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/MultiHOG.h"
+#include "arm_compute/runtime/NEON/functions/NEHOGDescriptor.h"
+#include "arm_compute/runtime/NEON/functions/NEHOGMultiDetection.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "tests/NEON/Accessor.h"
+#include "tests/NEON/ArrayAccessor.h"
+#include "tests/NEON/HOGAccessor.h"
+#include "tests/datasets/HOGMultiDetectionDataset.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/fixtures/HOGMultiDetectionFixture.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace
+{
+/* Set the tolerance (percentage) used when validating the strength of detection window. */
+RelativeTolerance<float> tolerance(1.0f);
+} // namespace
+
+TEST_SUITE(NEON)
+TEST_SUITE(HOGMultiDetection)
+
+// *INDENT-OFF*
+// clang-format off
+using NEHOGMultiDetectionFixture = HOGMultiDetectionValidationFixture<Tensor,
+                                                                      HOG,
+                                                                      MultiHOG,
+                                                                      DetectionWindowArray,
+                                                                      Size2DArray,
+                                                                      Accessor,
+                                                                      ArrayAccessor<Size2D>,
+                                                                      ArrayAccessor<DetectionWindow>,
+                                                                      HOGAccessor,
+                                                                      NEHOGMultiDetection,
+                                                                      uint8_t,
+                                                                      float>;
+
+FIXTURE_DATA_TEST_CASE(RunSmall, NEHOGMultiDetectionFixture, framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(
+                       datasets::SmallHOGMultiDetectionDataset(),
+                       framework::dataset::make("Format", Format::U8)),
+                       framework::dataset::make("BorderMode", {BorderMode::CONSTANT, BorderMode::REPLICATE})),
+                       framework::dataset::make("NonMaximaSuppression", {false, true})))
+{
+    // Validate output
+    validate_detection_windows(_target.begin(), _target.end(), _reference.begin(), _reference.end(), tolerance);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, NEHOGMultiDetectionFixture, framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(
+                       datasets::LargeHOGMultiDetectionDataset(),
+                       framework::dataset::make("Format", Format::U8)),
+                       framework::dataset::make("BorderMode", {BorderMode::CONSTANT, BorderMode::REPLICATE})),
+                       framework::dataset::make("NonMaximaSuppression", {false, true})))
+{
+    // Validate output
+    validate_detection_windows(_target.begin(), _target.end(), _reference.begin(), _reference.end(), tolerance);
+}
+
+// clang-format on
+// *INDENT-ON*
+
+TEST_SUITE_END()
+TEST_SUITE_END()
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/NEON/Im2Col.cpp b/tests/validation/NEON/Im2Col.cpp
index 96dd6f8..50081f0 100644
--- a/tests/validation/NEON/Im2Col.cpp
+++ b/tests/validation/NEON/Im2Col.cpp
@@ -23,10 +23,13 @@
  */
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/NEON/functions/NEIm2Col.h"
+#include "tests/NEON/Accessor.h"
+#include "tests/datasets/ShapeDatasets.h"
 #include "tests/framework/Asserts.h"
 #include "tests/framework/Macros.h"
 #include "tests/framework/datasets/Datasets.h"
 #include "tests/validation/Validation.h"
+#include "tests/validation/fixtures/Im2ColFixture.h"
 
 namespace arm_compute
 {
@@ -34,6 +37,12 @@
 {
 namespace validation
 {
+namespace
+{
+const auto conv_args = combine(combine(combine(framework::dataset::make("KernelDims", { Size2D(3U, 3U), Size2D(5U, 5U) }), framework::dataset::make("PadStride", { PadStrideInfo(1U, 1U, 0U, 0U), PadStrideInfo(1U, 1U, 1U, 1U), PadStrideInfo(2U, 2U, 0U, 2U) })),
+                                       framework::dataset::make("QuantizationInfo", QuantizationInfo(0.5f, 10))),
+                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }));
+} // namespace
 TEST_SUITE(NEON)
 TEST_SUITE(Im2Col)
 
@@ -45,7 +54,7 @@
                                                        TensorInfo(TensorShape(10U, 12U, 2U), 1, DataType::QS8, 2),  // Mismatching fixed point
                                                        TensorInfo(TensorShape(10U, 12U, 2U), 1, DataType::QASYMM8), // Bias not supported with QASYMM8
                                                        TensorInfo(TensorShape(10U, 12U, 2U), 1, DataType::QASYMM8), // Mismatching shapes
-                                                       TensorInfo(TensorShape(10U, 12U, 2U), 1, DataType::QASYMM8),
+                                                       TensorInfo(TensorShape(10U, 12U, 2U, 2U), 1, DataType::QASYMM8),
                                                      }),
                framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(3U, 4U, 10U, 2U), 1, DataType::F16),
                                                        TensorInfo(TensorShape(3U, 4U, 10U, 2U), 1, DataType::F16),
@@ -58,12 +67,49 @@
                framework::dataset::make("Expected", { false, false, false, false, false, true })),
                input_info, output_info, has_bias, expected)
 {
-    bool status = bool(NEIm2Col::validate(&input_info, &output_info, Size2D(3U, 3U), PadStrideInfo(), has_bias, false));
+    bool status = bool(NEIm2Col::validate(&input_info, &output_info, Size2D(3U, 3U), PadStrideInfo(), has_bias, false, false));
     ARM_COMPUTE_EXPECT(status == expected, framework::LogLevel::ERRORS);
 }
 // clang-format on
 // *INDENT-ON*
 
+template <typename T>
+using NEIm2ColFixture = Im2ColValidationFixture<Tensor, Accessor, NEIm2Col, T>;
+
+TEST_SUITE(Float)
+TEST_SUITE(FP32)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEIm2ColFixture<float>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F32)),
+                                                                                              conv_args))
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+TEST_SUITE_END()
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+TEST_SUITE(FP16)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEIm2ColFixture<half>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F16)),
+                                                                                             conv_args))
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+TEST_SUITE_END()
+
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+
+TEST_SUITE_END()
+
+TEST_SUITE(QASYMM8)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEIm2ColFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::QASYMM8)),
+                                                                                                conv_args))
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+TEST_SUITE_END()
+
 TEST_SUITE_END()
 TEST_SUITE_END()
 } // namespace validation
diff --git a/tests/validation/NEON/L2NormalizeLayer.cpp b/tests/validation/NEON/L2NormalizeLayer.cpp
index c0f5920..f868ade 100644
--- a/tests/validation/NEON/L2NormalizeLayer.cpp
+++ b/tests/validation/NEON/L2NormalizeLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -49,6 +49,37 @@
 TEST_SUITE(NEON)
 TEST_SUITE(L2NormalizeLayer)
 
+// *INDENT-OFF*
+// clang-format off
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
+    framework::dataset::make("InputInfo",  { TensorInfo(TensorShape(128U, 64U), 1, DataType::F32), // Mismatching data type input/output
+                                             TensorInfo(TensorShape(128U, 64U), 1, DataType::F32), // Mismatching shape input/output
+                                             TensorInfo(TensorShape(128U, 64U), 2, DataType::F32), // Number of Input channels != 1
+                                             TensorInfo(TensorShape(128U, 64U), 1, DataType::S16), // DataType != F32
+                                             TensorInfo(TensorShape(128U, 64U), 1, DataType::F32), // Axis >= num_max_dimensions
+                                             TensorInfo(TensorShape(128U, 64U), 1, DataType::F32), // Axis > 0
+                                             TensorInfo(TensorShape(128U, 64U), 1, DataType::F32)
+                                           }),
+    framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(128U, 64U), 1, DataType::F16),
+                                             TensorInfo(TensorShape(256U, 64U), 1, DataType::F32),
+                                             TensorInfo(TensorShape(128U, 64U), 1, DataType::F32),
+                                             TensorInfo(TensorShape(128U, 64U), 1, DataType::S16),
+                                             TensorInfo(TensorShape(128U, 64U), 1, DataType::F32),
+                                             TensorInfo(TensorShape(128U, 64U), 1, DataType::F32),
+                                             TensorInfo(TensorShape(128U, 64U), 1, DataType::F32)
+                                           })),
+    framework::dataset::make("Axis",       { 0U, 0U, 0U, 0U, static_cast<unsigned int>(TensorShape::num_max_dimensions), 1U, 0U })),
+    framework::dataset::make("Expected",   { false, false, false, false, false, false, true })),
+    input_info, output_info, axis, expected)
+{
+    bool is_valid = bool(NEL2NormalizeLayer::validate(&input_info.clone()->set_is_resizable(false),
+                                                      &output_info.clone()->set_is_resizable(false),
+                                                      axis));
+    ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS);
+}
+// clang-format on
+// *INDENT-ON*
+
 template <typename T>
 using NEL2NormalizeLayerFixture = L2NormalizeLayerValidationFixture<Tensor, Accessor, NEL2NormalizeLayer, T>;
 
diff --git a/tests/validation/NEON/LocallyConnected.cpp b/tests/validation/NEON/LocallyConnected.cpp
new file mode 100644
index 0000000..0c36ff6
--- /dev/null
+++ b/tests/validation/NEON/LocallyConnected.cpp
@@ -0,0 +1,159 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+#include "tests/NEON/Accessor.h"
+#include "tests/PaddingCalculator.h"
+#include "tests/datasets/LocallyConnectedDataset.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/fixtures/LocallyConnectedFixture.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace
+{
+constexpr RelativeTolerance<float> tolerance_f32(0.0001f); /**< Tolerance value for comparing reference's output against implementation's output for DataType::F32 */
+} // namespace
+
+TEST_SUITE(NEON)
+TEST_SUITE(LocallyConnected)
+
+// *INDENT-OFF*
+// clang-format off
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(
+    framework::dataset::make("InputInfo",  { TensorInfo(TensorShape(23U, 27U, 5U), 1, DataType::F32, 0), // Mismatching data type input/weights
+                                             TensorInfo(TensorShape(23U, 27U, 5U), 1, DataType::F32, 0), // Mismatching data type input/bias
+                                             TensorInfo(TensorShape(23U, 27U, 5U), 1, DataType::F32, 0), // Mismatching data type input/output
+                                             TensorInfo(TensorShape(23U, 27U, 5U), 1, DataType::F32, 0), // Mismatching shape input/weights
+                                             TensorInfo(TensorShape(23U, 27U, 5U), 1, DataType::F32, 0), // Mismatching shape input/bias
+                                             TensorInfo(TensorShape(23U, 27U, 5U), 1, DataType::F32, 0), // Mismatching shape input/output
+                                             TensorInfo(TensorShape(23U, 27U, 5U), 1, DataType::F32, 0), // Asymmetric padding
+                                             TensorInfo(TensorShape(23U, 27U, 5U), 1, DataType::F32, 0)
+                                           }),
+    framework::dataset::make("WeightsInfo",{ TensorInfo(TensorShape(3U, 3U, 5U, 21U, 275U), 1, DataType::F16, 0),
+                                             TensorInfo(TensorShape(3U, 3U, 5U, 21U, 275U), 1, DataType::F32, 0),
+                                             TensorInfo(TensorShape(3U, 3U, 5U, 21U, 275U), 1, DataType::F32, 0),
+                                             TensorInfo(TensorShape(3U, 3U, 5U, 21U, 274U), 1, DataType::F32, 0),
+                                             TensorInfo(TensorShape(3U, 3U, 5U, 21U, 275U), 1, DataType::F32, 0),
+                                             TensorInfo(TensorShape(3U, 3U, 5U, 21U, 275U), 1, DataType::F32, 0),
+                                             TensorInfo(TensorShape(3U, 3U, 5U, 21U, 275U), 1, DataType::F32, 0),
+                                             TensorInfo(TensorShape(3U, 3U, 5U, 21U, 275U), 1, DataType::F32, 0)
+                                           })),
+    framework::dataset::make("BiasInfo",   { TensorInfo(TensorShape(21U, 275U), 1, DataType::F32, 0),
+                                             TensorInfo(TensorShape(21U, 275U), 1, DataType::F16, 0),
+                                             TensorInfo(TensorShape(21U, 275U), 1, DataType::F32, 0),
+                                             TensorInfo(TensorShape(21U, 275U), 1, DataType::F32, 0),
+                                             TensorInfo(TensorShape(21U, 274U), 1, DataType::F32, 0),
+                                             TensorInfo(TensorShape(21U, 275U), 1, DataType::F32, 0),
+                                             TensorInfo(TensorShape(21U, 275U), 1, DataType::F32, 0),
+                                             TensorInfo(TensorShape(21U, 275U), 1, DataType::F32, 0)
+                                           })),
+    framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(11U, 25U, 21U), 1, DataType::F32, 0),
+                                             TensorInfo(TensorShape(11U, 25U, 21U), 1, DataType::F32, 0),
+                                             TensorInfo(TensorShape(11U, 25U, 21U), 1, DataType::F16, 0),
+                                             TensorInfo(TensorShape(11U, 25U, 21U), 1, DataType::F32, 0),
+                                             TensorInfo(TensorShape(11U, 25U, 21U), 1, DataType::F32, 0),
+                                             TensorInfo(TensorShape(11U, 25U, 22U), 1, DataType::F32, 0),
+                                             TensorInfo(TensorShape(11U, 25U, 21U), 1, DataType::F32, 0),
+                                             TensorInfo(TensorShape(11U, 25U, 21U), 1, DataType::F32, 0)
+                                           })),
+    framework::dataset::make("PadStride",  { PadStrideInfo(2, 1, 0, 0),
+                                             PadStrideInfo(2, 1, 0, 0),
+                                             PadStrideInfo(2, 1, 0, 0),
+                                             PadStrideInfo(2, 1, 0, 0),
+                                             PadStrideInfo(2, 1, 0, 0),
+                                             PadStrideInfo(2, 1, 0, 0),
+                                             PadStrideInfo(2, 1, 1, 0, 0, 0, DimensionRoundingType::FLOOR),
+                                             PadStrideInfo(2, 1, 0, 0)
+                                           })),
+    framework::dataset::make("Expected", { false, false, false, false, false, false, false, true })),
+    input_info, weights_info, bias_info, output_info, conv_info, expected)
+{
+    bool is_valid = bool(NELocallyConnectedLayer::validate(&input_info.clone()->set_is_resizable(false),
+                                                           &weights_info.clone()->set_is_resizable(false),
+                                                           &bias_info.clone()->set_is_resizable(false),
+                                                           &output_info.clone()->set_is_resizable(false),
+                                                           conv_info));
+    ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS);
+}
+// clang-format on
+// *INDENT-ON*
+
+DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(framework::dataset::concat(datasets::SmallLocallyConnectedDataset(), datasets::LargeLocallyConnectedDataset()),
+                                                                   framework::dataset::make("DataType", DataType::F32)),
+               src_shape, weights_shape, bias_shape, dst_shape, info, dilation, data_type)
+{
+    ARM_COMPUTE_UNUSED(dilation);
+
+    // Create tensors
+    Tensor src     = create_tensor<Tensor>(src_shape, data_type);
+    Tensor weights = create_tensor<Tensor>(weights_shape, data_type);
+    Tensor bias    = create_tensor<Tensor>(bias_shape, data_type);
+    Tensor dst     = create_tensor<Tensor>(dst_shape, data_type);
+
+    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(weights.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(bias.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+    // Create and configure function.
+    NELocallyConnectedLayer lc;
+    lc.configure(&src, &weights, &bias, &dst, info);
+
+    // Validate valid region
+    const ValidRegion dst_valid_region = shape_to_valid_region(dst_shape);
+    validate(dst.info()->valid_region(), dst_valid_region);
+}
+
+template <typename T>
+using NELocallyConnectedFixture = LocallyConnectedValidationFixture<Tensor, Accessor, NELocallyConnectedLayer, T>;
+FIXTURE_DATA_TEST_CASE(RunSmall, NELocallyConnectedFixture<float>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallLocallyConnectedDataset(),
+                                                                                                              framework::dataset::make("DataType",
+                                                                                                                      DataType::F32)))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_f32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, NELocallyConnectedFixture<float>, framework::DatasetMode::PRECOMMIT, combine(datasets::LargeLocallyConnectedDataset(),
+                                                                                                              framework::dataset::make("DataType",
+                                                                                                                      DataType::F32)))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_f32);
+}
+TEST_SUITE_END()
+TEST_SUITE_END()
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/NEON/OpticalFlow.cpp b/tests/validation/NEON/OpticalFlow.cpp
new file mode 100644
index 0000000..1f4bf5f
--- /dev/null
+++ b/tests/validation/NEON/OpticalFlow.cpp
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/Array.h"
+#include "arm_compute/runtime/NEON/functions/NEGaussianPyramid.h"
+#include "arm_compute/runtime/NEON/functions/NEOpticalFlow.h"
+#include "arm_compute/runtime/Pyramid.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "tests/NEON/Accessor.h"
+#include "tests/NEON/ArrayAccessor.h"
+#include "tests/datasets/BorderModeDataset.h"
+#include "tests/datasets/OpticalFlowDataset.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/fixtures/OpticalFlowFixture.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+TEST_SUITE(NEON)
+TEST_SUITE(OpticalFlow)
+
+// *INDENT-OFF*
+// clang-format off
+using NEOpticalFlowFixture = OpticalFlowValidationFixture<Tensor,
+                                                          Accessor,
+                                                          KeyPointArray,
+                                                          ArrayAccessor<KeyPoint>,
+                                                          NEOpticalFlow,
+                                                          Pyramid,
+                                                          NEGaussianPyramidHalf,
+                                                          uint8_t>;
+
+FIXTURE_DATA_TEST_CASE(RunSmall, NEOpticalFlowFixture, framework::DatasetMode::PRECOMMIT, combine(combine(
+                       datasets::SmallOpticalFlowDataset(),
+                       framework::dataset::make("Format", Format::U8)),
+                       datasets::BorderModes()))
+{
+    // Validate output
+    ArrayAccessor<KeyPoint> array(_target);
+    validate_keypoints(array.buffer(),
+                       array.buffer() + array.num_values(),
+                       _reference.begin(),
+                       _reference.end());
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, NEOpticalFlowFixture, framework::DatasetMode::NIGHTLY, combine(combine(
+                       datasets::LargeOpticalFlowDataset(),
+                       framework::dataset::make("Format", Format::U8)),
+                       datasets::BorderModes()))
+{
+    // Validate output
+    ArrayAccessor<KeyPoint> array(_target);
+
+    validate_keypoints(array.buffer(),
+                       array.buffer() + array.num_values(),
+                       _reference.begin(),
+                       _reference.end());
+}
+// clang-format on
+// *INDENT-ON*
+
+TEST_SUITE_END()
+TEST_SUITE_END()
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/NEON/PixelWiseMultiplication.cpp b/tests/validation/NEON/PixelWiseMultiplication.cpp
index 44b4ff2..9304c8b 100644
--- a/tests/validation/NEON/PixelWiseMultiplication.cpp
+++ b/tests/validation/NEON/PixelWiseMultiplication.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -117,6 +117,8 @@
 using NEPixelWiseMultiplicationToQS8Fixture = PixelWiseMultiplicationValidationFixture<Tensor, Accessor, NEPixelWiseMultiplication, T, qint8_t>;
 template <typename T>
 using NEPixelWiseMultiplicationToQS16Fixture = PixelWiseMultiplicationValidationFixture<Tensor, Accessor, NEPixelWiseMultiplication, T, qint16_t>;
+template <typename T>
+using NEPixelWiseMultiplicationBroadcastFixture = PixelWiseMultiplicationBroadcastValidationFixture<Tensor, Accessor, NEPixelWiseMultiplication, T, float>;
 
 TEST_SUITE(NEON)
 TEST_SUITE(PixelWiseMultiplication)
@@ -262,6 +264,10 @@
 
 TEST_SUITE_END() // F32toF32
 
+TEST_SUITE(Broadcast)
+PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, BroadcastFixture<float>, PRECOMMIT, SmallShapesBroadcast(), F32, F32, scale_255, TO_NEAREST_UP, VALIDATE(float, 1.f))
+TEST_SUITE_END() // Broadcast
+
 TEST_SUITE_END()
 TEST_SUITE_END()
 } // namespace validation
diff --git a/tests/validation/NEON/PoolingLayer.cpp b/tests/validation/NEON/PoolingLayer.cpp
index 2a86c10..165f4d6 100644
--- a/tests/validation/NEON/PoolingLayer.cpp
+++ b/tests/validation/NEON/PoolingLayer.cpp
@@ -59,7 +59,7 @@
 
 const auto PoolingLayerDatasetQASYMM8 = combine(combine(combine(framework::dataset::make("PoolingType", { PoolingType::MAX, PoolingType::AVG }), framework::dataset::make("PoolingSize", { Size2D(2, 2), Size2D(3, 3), Size2D(4, 4), Size2D(9, 9), Size2D(3, 7), Size2D(7, 8) })),
                                                         framework::dataset::make("PadStride", { PadStrideInfo(1, 1, 0, 0), PadStrideInfo(2, 1, 0, 0), PadStrideInfo(1, 2, 1, 1), PadStrideInfo(2, 2, 1, 0) })),
-                                                framework::dataset::make("ExcludePadding", { true, false }));
+                                                framework::dataset::make("ExcludePadding", { true }));
 
 constexpr AbsoluteTolerance<float> tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for float types */
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
@@ -128,37 +128,42 @@
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f32);
 }
-FIXTURE_DATA_TEST_CASE(RunSmall, NEPoolingLayerFixture<float>, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), combine(PoolingLayerDatasetFP, framework::dataset::make("DataType",
-                                                                                                    DataType::F32))))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEPoolingLayerFixture<float>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), combine(PoolingLayerDatasetFP, framework::dataset::make("DataType",
+                                                                                                            DataType::F32))),
+                                                                                                    framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f32);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NEPoolingLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapes(), combine(PoolingLayerDatasetFP, framework::dataset::make("DataType",
-                                                                                                        DataType::F32))))
+FIXTURE_DATA_TEST_CASE(RunLarge, NEPoolingLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(), combine(PoolingLayerDatasetFP,
+                                                                                                                framework::dataset::make("DataType",
+                                                                                                                        DataType::F32))),
+                                                                                                        framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f32);
 }
-TEST_SUITE_END()
+TEST_SUITE_END() // FP32
 
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEPoolingLayerFixture<half>, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), combine(PoolingLayerDatasetFP,
-                                                                                                   framework::dataset::make("DataType", DataType::F16))))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEPoolingLayerFixture<half>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), combine(PoolingLayerDatasetFP,
+                                                                                                           framework::dataset::make("DataType", DataType::F16))),
+                                                                                                   framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f16);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NEPoolingLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapes(), combine(PoolingLayerDatasetFP,
-                                                                                                       framework::dataset::make("DataType", DataType::F16))))
+FIXTURE_DATA_TEST_CASE(RunLarge, NEPoolingLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(), combine(PoolingLayerDatasetFP,
+                                                                                                               framework::dataset::make("DataType", DataType::F16))),
+                                                                                                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f16);
 }
-TEST_SUITE_END()
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-TEST_SUITE_END()
+TEST_SUITE_END() // FP16
+#endif           /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+TEST_SUITE_END() // Float
 
 template <typename T>
 using NEPoolingLayerFixedPointFixture = PoolingLayerValidationFixedPointFixture<Tensor, Accessor, NEPoolingLayer, T>;
@@ -179,7 +184,7 @@
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qs8);
 }
-TEST_SUITE_END()
+TEST_SUITE_END() // QS8
 
 TEST_SUITE(QS16)
 FIXTURE_DATA_TEST_CASE(RunTiny, NEPoolingLayerFixedPointFixture<int16_t>, framework::DatasetMode::ALL, combine(combine(datasets::TinyShapes(), combine(PoolingLayerDatasetQS,
@@ -196,8 +201,8 @@
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qs16);
 }
-TEST_SUITE_END()
-TEST_SUITE_END()
+TEST_SUITE_END() // QS16
+TEST_SUITE_END() // FixedPoint
 
 TEST_SUITE(Quantized)
 
@@ -205,27 +210,28 @@
 using NEPoolingLayerQuantizedFixture = PoolingLayerValidationQuantizedFixture<Tensor, Accessor, NEPoolingLayer, T>;
 
 TEST_SUITE(QASYMM8)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEPoolingLayerQuantizedFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), combine(PoolingLayerDatasetQASYMM8,
+FIXTURE_DATA_TEST_CASE(RunSmall, NEPoolingLayerQuantizedFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallShapes(), combine(PoolingLayerDatasetQASYMM8,
                                                                                                                        framework::dataset::make("DataType", DataType::QASYMM8))),
-                                                                                                               framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255, 127),
-                                                                                                                       QuantizationInfo(7.f / 255, 123)
-                                                                                                                                                            })))
+                                                                                                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255, 127),
+                                                                                                                               QuantizationInfo(7.f / 255, 123)
+                                                                                                                                                                    })),
+                                                                                                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NEPoolingLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(), combine(PoolingLayerDatasetQASYMM8,
+FIXTURE_DATA_TEST_CASE(RunLarge, NEPoolingLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), combine(PoolingLayerDatasetQASYMM8,
                                                                                                                    framework::dataset::make("DataType", DataType::QASYMM8))),
-                                                                                                                   framework::dataset::make("QuantizationInfo", { QuantizationInfo(1.f / 255, 0) })))
+                                                                                                                   framework::dataset::make("QuantizationInfo", { QuantizationInfo(1.f / 255, 0) })),
+                                                                                                                   framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
-TEST_SUITE_END()
-TEST_SUITE_END()
-
-TEST_SUITE_END()
-TEST_SUITE_END()
+TEST_SUITE_END() // QASYMM8
+TEST_SUITE_END() // Quantized
+TEST_SUITE_END() // PoolingLayer
+TEST_SUITE_END() // NEON
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/NEON/ReductionOperation.cpp b/tests/validation/NEON/ReductionOperation.cpp
index cf603c6..c2f2909 100644
--- a/tests/validation/NEON/ReductionOperation.cpp
+++ b/tests/validation/NEON/ReductionOperation.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -50,6 +50,36 @@
 TEST_SUITE(NEON)
 TEST_SUITE(ReductionOperation)
 
+// *INDENT-OFF*
+// clang-format off
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
+    framework::dataset::make("InputInfo",          { TensorInfo(TensorShape(128U, 64U), 1, DataType::F32), // Mismatching data type input/output
+                                                     TensorInfo(TensorShape(128U, 64U), 2, DataType::F32), // Number of Input channels != 1
+                                                     TensorInfo(TensorShape(128U, 64U), 1, DataType::S16), // DataType != F32
+                                                     TensorInfo(TensorShape(128U, 64U), 1, DataType::F32), // Axis >= num_max_dimensions
+                                                     TensorInfo(TensorShape(128U, 64U), 1, DataType::F32), // Axis > 0
+                                                     TensorInfo(TensorShape(128U, 64U), 1, DataType::F32)
+                                                   }),
+    framework::dataset::make("OutputInfo",         { TensorInfo(TensorShape(1U, 64U), 1, DataType::F16),
+                                                     TensorInfo(TensorShape(1U, 64U), 1, DataType::F32),
+                                                     TensorInfo(TensorShape(1U, 64U), 1, DataType::S16),
+                                                     TensorInfo(TensorShape(1U, 64U), 1, DataType::F32),
+                                                     TensorInfo(TensorShape(1U, 64U), 1, DataType::F32),
+                                                     TensorInfo(TensorShape(1U, 64U), 1, DataType::F32)
+                                                   })),
+    framework::dataset::make("Axis",               { 0U, 0U, 0U, static_cast<unsigned int>(TensorShape::num_max_dimensions), 1U, 0U })),
+    framework::dataset::make("Expected",           { false, false, false, false, false, true })),
+    input_info, output_info, axis, expected)
+{
+    bool is_valid = bool(NEReductionOperation::validate(&input_info.clone()->set_is_resizable(false),
+                                                        &output_info.clone()->set_is_resizable(true),
+                                                        axis,
+                                                        ReductionOperation::SUM_SQUARE));
+    ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS);
+}
+// clang-format on
+// *INDENT-ON*
+
 template <typename T>
 using NEReductionOperationFixture = ReductionOperationValidationFixture<Tensor, Accessor, NEReductionOperation, T>;
 
diff --git a/tests/validation/NEON/Scale.cpp b/tests/validation/NEON/Scale.cpp
index b92162e..8940259 100644
--- a/tests/validation/NEON/Scale.cpp
+++ b/tests/validation/NEON/Scale.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -55,6 +55,13 @@
     DataType::F32,
 });
 
+/** Scale data layouts */
+const auto ScaleDataLayouts = framework::dataset::make("DataLayout",
+{
+    DataLayout::NCHW,
+    DataLayout::NHWC,
+});
+
 /** Tolerance */
 constexpr AbsoluteTolerance<uint8_t> tolerance_u8(1);
 constexpr AbsoluteTolerance<int16_t> tolerance_s16(1);
@@ -67,29 +74,93 @@
 TEST_SUITE(NEON)
 TEST_SUITE(Scale)
 
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()), ScaleDataTypes),
+// *INDENT-OFF*
+// clang-format off
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(
+        framework::dataset::make("InputInfo", { TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::U8, 0),  // Mismatching data type
+                                                TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32, 0), // Unsupported sampling point
+                                                TensorInfo(TensorShape(4U, 27U, 13U), 1, DataType::F32, 0), // Invalid policy
+                                                TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32, 0), // Insufficient padding
+                                                TensorInfo(TensorShape(4U, 27U, 13U), 1, DataType::F32, 0),
+                                              }),
+        framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(132U, 25U, 2U), 1, DataType::F32, 0),
+                                                TensorInfo(TensorShape(132U, 25U, 2U), 1, DataType::F32, 0),
+                                                TensorInfo(TensorShape(4U, 132U, 25U), 1, DataType::F32, 0),
+                                                TensorInfo(TensorShape(132U, 25U, 2U), 1, DataType::F32, 0),
+                                                TensorInfo(TensorShape(4U, 132U, 25U), 1, DataType::F32, 0),
+                                              })),
+        framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR,
+                                                          InterpolationPolicy::NEAREST_NEIGHBOR,
+                                                          InterpolationPolicy::AREA,
+                                                          InterpolationPolicy::AREA,
+                                                          InterpolationPolicy::NEAREST_NEIGHBOR,
+                                                        })),
+        framework::dataset::make("BorderMode",  { BorderMode::UNDEFINED,
+                                                  BorderMode::UNDEFINED,
+                                                  BorderMode::UNDEFINED,
+                                                  BorderMode::UNDEFINED,
+                                                  BorderMode::REPLICATE,
+                                                })),
+        framework::dataset::make("SamplingPolicy",  { SamplingPolicy::CENTER,
+                                                      SamplingPolicy::TOP_LEFT,
+                                                      SamplingPolicy::CENTER,
+                                                      SamplingPolicy::CENTER,
+                                                      SamplingPolicy::CENTER,
+                                                    })),
+        framework::dataset::make("DataLayout",  { DataLayout::NCHW,
+                                                  DataLayout::NCHW,
+                                                  DataLayout::NHWC,
+                                                  DataLayout::NCHW,
+                                                  DataLayout::NHWC,
+                                                })),
+        framework::dataset::make("Expected", { false, false, false, false, true })),
+        input_info, output_info, policy, border_mode, sampling_policy, data_layout, expected)
+{
+    const PixelValue constant_border(5);
+    Status status = NEScale::validate(&input_info.clone()->set_is_resizable(false).set_data_layout(data_layout),
+                                           &output_info.clone()->set_is_resizable(false).set_data_layout(data_layout),
+                                           policy, border_mode, constant_border, sampling_policy);
+    ARM_COMPUTE_EXPECT(bool(status) == expected, framework::LogLevel::ERRORS);
+}
+// clang-format on
+// *INDENT-ON*
+
+DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()), ScaleDataTypes), ScaleDataLayouts),
                                                                                    framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
                                                                            datasets::BorderModes()),
                                                                    framework::dataset::make("SamplingPolicy", { SamplingPolicy::CENTER })),
-               shape, data_type, policy, border_mode, sampling_policy)
+               shape, data_type, data_layout, policy, border_mode, sampling_policy)
 {
     std::mt19937                          generator(library->seed());
     std::uniform_real_distribution<float> distribution_float(0.25, 2);
     const float                           scale_x               = distribution_float(generator);
     const float                           scale_y               = distribution_float(generator);
     uint8_t                               constant_border_value = 0;
+    TensorShape                           src_shape             = shape;
     if(border_mode == BorderMode::CONSTANT)
     {
         std::uniform_int_distribution<uint8_t> distribution_u8(0, 255);
         constant_border_value = distribution_u8(generator);
     }
 
+    // Get width/height indices depending on layout
+    const int idx_width  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+
+    // Change shape in case of NHWC.
+    if(data_layout == DataLayout::NHWC)
+    {
+        permute(src_shape, PermutationVector(2U, 0U, 1U));
+    }
+
+    // Calculate scaled shape
+    TensorShape shape_scaled(src_shape);
+    shape_scaled.set(idx_width, src_shape[idx_width] * scale_x);
+    shape_scaled.set(idx_height, src_shape[idx_height] * scale_y);
+
     // Create tensors
-    Tensor      src = create_tensor<Tensor>(shape, data_type);
-    TensorShape shape_scaled(shape);
-    shape_scaled.set(0, shape[0] * scale_x);
-    shape_scaled.set(1, shape[1] * scale_y);
-    Tensor dst = create_tensor<Tensor>(shape_scaled, data_type);
+    Tensor src = create_tensor<Tensor>(src_shape, data_type, 1, 0, QuantizationInfo(), data_layout);
+    Tensor dst = create_tensor<Tensor>(shape_scaled, data_type, 1, 0, QuantizationInfo(), data_layout);
 
     ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
     ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
@@ -99,15 +170,27 @@
     nescale.configure(&src, &dst, policy, border_mode, constant_border_value, sampling_policy);
 
     // Validate valid region
-    const ValidRegion dst_valid_region = calculate_valid_region_scale(*(src.info()), shape_scaled, policy, BorderSize(1), (border_mode == BorderMode::UNDEFINED));
-
+    const ValidRegion dst_valid_region = calculate_valid_region_scale(*(src.info()), shape_scaled, policy, sampling_policy, (border_mode == BorderMode::UNDEFINED));
     validate(dst.info()->valid_region(), dst_valid_region);
 
     // Validate padding
-    PaddingCalculator calculator(shape_scaled.x(), 16);
+    int num_elements_processed_x = 16;
+    if(data_layout == DataLayout::NHWC)
+    {
+        num_elements_processed_x = (policy == InterpolationPolicy::BILINEAR) ? 1 : 16 / src.info()->element_size();
+    }
+    PaddingCalculator calculator(shape_scaled.x(), num_elements_processed_x);
     calculator.set_border_mode(border_mode);
 
-    const PaddingSize read_padding(1);
+    PaddingSize read_padding(1);
+    if(data_layout == DataLayout::NHWC)
+    {
+        read_padding = calculator.required_padding(PaddingCalculator::Option::EXCLUDE_BORDER);
+        if(border_mode == BorderMode::CONSTANT && policy == InterpolationPolicy::BILINEAR)
+        {
+            read_padding.top = 1;
+        }
+    }
     const PaddingSize write_padding = calculator.required_padding(PaddingCalculator::Option::EXCLUDE_BORDER);
     validate(src.info()->padding(), read_padding);
     validate(dst.info()->padding(), write_padding);
@@ -118,28 +201,30 @@
 
 TEST_SUITE(Float)
 TEST_SUITE(FP32)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEScaleFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType",
+FIXTURE_DATA_TEST_CASE(RunSmall, NEScaleFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType",
                                                                                                                      DataType::F32)),
+                                                                                                                     framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
                                                                                                              framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
                                                                                                      datasets::BorderModes()),
                                                                                              framework::dataset::make("SamplingPolicy", { SamplingPolicy::CENTER })))
 {
     //Create valid region
     TensorInfo  src_info(_shape, 1, _data_type);
-    ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, BorderSize(1), (_border_mode == BorderMode::UNDEFINED));
+    ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
 
     // Validate output
     validate(Accessor(_target), _reference, valid_region, tolerance_f32, tolerance_num_f32);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NEScaleFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType",
+FIXTURE_DATA_TEST_CASE(RunLarge, NEScaleFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType",
                                                                                                                  DataType::F32)),
+                                                                                                                 framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
                                                                                                                  framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
                                                                                                          datasets::BorderModes()),
                                                                                                  framework::dataset::make("SamplingPolicy", { SamplingPolicy::CENTER })))
 {
     //Create valid region
     TensorInfo  src_info(_shape, 1, _data_type);
-    ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, BorderSize(1), (_border_mode == BorderMode::UNDEFINED));
+    ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
 
     // Validate output
     validate(Accessor(_target), _reference, valid_region, tolerance_f32, tolerance_num_f32);
@@ -149,56 +234,60 @@
 
 TEST_SUITE(Integer)
 TEST_SUITE(U8)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEScaleFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType",
+FIXTURE_DATA_TEST_CASE(RunSmall, NEScaleFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType",
                                                                                                                        DataType::U8)),
+                                                                                                                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
                                                                                                                framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
                                                                                                        datasets::BorderModes()),
                                                                                                framework::dataset::make("SamplingPolicy", { SamplingPolicy::CENTER })))
 {
     //Create valid region
     TensorInfo  src_info(_shape, 1, _data_type);
-    ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, BorderSize(1), (_border_mode == BorderMode::UNDEFINED));
+    ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
 
     // Validate output
     validate(Accessor(_target), _reference, valid_region, tolerance_u8);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NEScaleFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType",
+FIXTURE_DATA_TEST_CASE(RunLarge, NEScaleFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType",
                                                                                                                    DataType::U8)),
+                                                                                                                   framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
                                                                                                                    framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
                                                                                                            datasets::BorderModes()),
                                                                                                    framework::dataset::make("SamplingPolicy", { SamplingPolicy::CENTER })))
 {
     //Create valid region
     TensorInfo  src_info(_shape, 1, _data_type);
-    ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, BorderSize(1), (_border_mode == BorderMode::UNDEFINED));
+    ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
 
     // Validate output
     validate(Accessor(_target), _reference, valid_region, tolerance_u8);
 }
 TEST_SUITE_END()
 TEST_SUITE(S16)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEScaleFixture<int16_t>, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType",
+FIXTURE_DATA_TEST_CASE(RunSmall, NEScaleFixture<int16_t>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType",
                                                                                                                        DataType::S16)),
+                                                                                                                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
                                                                                                                framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
                                                                                                        datasets::BorderModes()),
                                                                                                framework::dataset::make("SamplingPolicy", { SamplingPolicy::CENTER })))
 {
     //Create valid region
     TensorInfo  src_info(_shape, 1, _data_type);
-    ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, BorderSize(1), (_border_mode == BorderMode::UNDEFINED));
+    ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
 
     // Validate output
     validate(Accessor(_target), _reference, valid_region, tolerance_s16, tolerance_num_s16);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NEScaleFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType",
+FIXTURE_DATA_TEST_CASE(RunLarge, NEScaleFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType",
                                                                                                                    DataType::S16)),
+                                                                                                                   framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
                                                                                                                    framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
                                                                                                            datasets::BorderModes()),
                                                                                                    framework::dataset::make("SamplingPolicy", { SamplingPolicy::CENTER })))
 {
     //Create valid region
     TensorInfo  src_info(_shape, 1, _data_type);
-    ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, BorderSize(1), (_border_mode == BorderMode::UNDEFINED));
+    ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
 
     // Validate output
     validate(Accessor(_target), _reference, valid_region, tolerance_s16, tolerance_num_s16);
diff --git a/tests/validation/NEON/UNIT/TensorAllocator.cpp b/tests/validation/NEON/UNIT/TensorAllocator.cpp
index 4732f3f..872054f 100644
--- a/tests/validation/NEON/UNIT/TensorAllocator.cpp
+++ b/tests/validation/NEON/UNIT/TensorAllocator.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,6 +24,7 @@
 #include "arm_compute/runtime/TensorAllocator.h"
 
 #include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/MemoryRegion.h"
 #include "support/ToolchainSupport.h"
 #include "tests/Utils.h"
 #include "tests/framework/Asserts.h"
@@ -45,10 +46,7 @@
     TensorInfo info(TensorShape(24U, 16U, 3U), 1, DataType::F32);
 
     // Allocate memory buffer
-    std::shared_ptr<uint8_t> buf(new uint8_t[info.total_size()](), [](uint8_t *ptr)
-    {
-        delete[] ptr;
-    });
+    auto buf = std::make_shared<MemoryRegion>(info.total_size());
 
     // Negative case : Import empty memory
     Tensor t1;
@@ -68,7 +66,7 @@
     t3.allocator()->init(info);
     ARM_COMPUTE_EXPECT(bool(t3.allocator()->import_memory(Memory(buf.get()))), framework::LogLevel::ERRORS);
     ARM_COMPUTE_EXPECT(!t3.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(t3.buffer() == buf.get(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(t3.buffer() == reinterpret_cast<uint8_t *>(buf->buffer()), framework::LogLevel::ERRORS);
     t3.allocator()->free();
     ARM_COMPUTE_EXPECT(t3.info()->is_resizable(), framework::LogLevel::ERRORS);
     ARM_COMPUTE_EXPECT(t3.buffer() == nullptr, framework::LogLevel::ERRORS);
@@ -78,7 +76,7 @@
     t4.allocator()->init(info);
     ARM_COMPUTE_EXPECT(bool(t4.allocator()->import_memory(Memory(buf))), framework::LogLevel::ERRORS);
     ARM_COMPUTE_EXPECT(!t4.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(t4.buffer() == buf.get(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(t4.buffer() == reinterpret_cast<uint8_t *>(buf->buffer()), framework::LogLevel::ERRORS);
     t4.allocator()->free();
     ARM_COMPUTE_EXPECT(t4.info()->is_resizable(), framework::LogLevel::ERRORS);
     ARM_COMPUTE_EXPECT(t4.buffer() == nullptr, framework::LogLevel::ERRORS);
diff --git a/tests/validation/UNIT/FixedPoint.cpp b/tests/validation/UNIT/FixedPoint.cpp
index 251f5a8..5c39b20 100644
--- a/tests/validation/UNIT/FixedPoint.cpp
+++ b/tests/validation/UNIT/FixedPoint.cpp
@@ -100,10 +100,6 @@
     }
 }
 
-// The last input argument specifies the expected number of failures for a
-// given combination of (function name, number of fractional bits) as defined
-// by the first two arguments.
-
 // *INDENT-OFF*
 // clang-format off
 DATA_TEST_CASE(FixedPointQS8Outputs, framework::DatasetMode::ALL, zip(combine(
diff --git a/tests/validation/UNIT/GPUTarget.cpp b/tests/validation/UNIT/GPUTarget.cpp
new file mode 100644
index 0000000..62e0b1d
--- /dev/null
+++ b/tests/validation/UNIT/GPUTarget.cpp
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/GPUTarget.h"
+#include "support/ToolchainSupport.h"
+#include "tests/AssetsLibrary.h"
+#include "tests/Globals.h"
+#include "tests/Utils.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Macros.h"
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+TEST_SUITE(UNIT)
+TEST_SUITE(GPUTarget)
+
+TEST_CASE(GetGPUTargetFromName, framework::DatasetMode::ALL)
+{
+    ARM_COMPUTE_EXPECT(get_target_from_name("Mali-T600") == GPUTarget::T600, framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(get_target_from_name("Mali-T700") == GPUTarget::T700, framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(get_target_from_name("Mali-T800") == GPUTarget::T800, framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(get_target_from_name("Mali-G71") == GPUTarget::G71, framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(get_target_from_name("Mali-G72") == GPUTarget::G72, framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(get_target_from_name("Mali-G51") == GPUTarget::G51, framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(get_target_from_name("Mali-G51BIG") == GPUTarget::G51BIG, framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(get_target_from_name("Mali-G51LIT") == GPUTarget::G51LIT, framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(get_target_from_name("Mali-TNOX") == GPUTarget::TNOX, framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(get_target_from_name("Mali-TTRX") == GPUTarget::TTRX, framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(get_target_from_name("Mali-TBOX") == GPUTarget::TBOX, framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(get_target_from_name("Mali-T000") == GPUTarget::MIDGARD, framework::LogLevel::ERRORS);
+}
+
+TEST_CASE(GPUTargetIsIn, framework::DatasetMode::ALL)
+{
+    ARM_COMPUTE_EXPECT(!gpu_target_is_in(GPUTarget::G71, GPUTarget::T600, GPUTarget::T800, GPUTarget::G72), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(gpu_target_is_in(GPUTarget::G71, GPUTarget::T600, GPUTarget::T800, GPUTarget::G71), framework::LogLevel::ERRORS);
+}
+TEST_SUITE_END() // GPUTarget
+TEST_SUITE_END() // UNIT
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/Validation.h b/tests/validation/Validation.h
index 5063180..ac3643e 100644
--- a/tests/validation/Validation.h
+++ b/tests/validation/Validation.h
@@ -69,7 +69,10 @@
     {
     }
 
-    /** Implicit conversion to the underlying type. */
+    /** Implicit conversion to the underlying type.
+     *
+     * @return the underlying type.
+     */
     constexpr operator T() const
     {
         return _value;
@@ -102,7 +105,10 @@
     {
     }
 
-    /** Implicit conversion to the underlying type. */
+    /** Implicit conversion to the underlying type.
+     *
+     * @return the underlying type.
+     */
     constexpr operator value_type() const
     {
         return _value;
@@ -131,19 +137,45 @@
 }
 
 template <typename T>
-bool compare_dimensions(const Dimensions<T> &dimensions1, const Dimensions<T> &dimensions2)
+bool compare_dimensions(const Dimensions<T> &dimensions1, const Dimensions<T> &dimensions2, const DataLayout &data_layout = DataLayout::NCHW)
 {
-    if(dimensions1.num_dimensions() != dimensions2.num_dimensions())
-    {
-        return false;
-    }
+    ARM_COMPUTE_ERROR_ON(data_layout == DataLayout::UNKNOWN);
 
-    for(unsigned int i = 0; i < dimensions1.num_dimensions(); ++i)
+    if(data_layout == DataLayout::NCHW)
     {
-        if(dimensions1[i] != dimensions2[i])
+        if(dimensions1.num_dimensions() != dimensions2.num_dimensions())
         {
             return false;
         }
+
+        for(unsigned int i = 0; i < dimensions1.num_dimensions(); ++i)
+        {
+            if(dimensions1[i] != dimensions2[i])
+            {
+                return false;
+            }
+        }
+    }
+    else
+    {
+        // In case a 2D shape becomes 3D after permutation, the permuted tensor will have one dimension more and the first value will be 1
+        if((dimensions1.num_dimensions() != dimensions2.num_dimensions()) && ((dimensions1.num_dimensions() != (dimensions2.num_dimensions() + 1)) || (dimensions1.x() != 1)))
+        {
+            return false;
+        }
+
+        if((dimensions1[0] != dimensions2[2]) || (dimensions1[1] != dimensions2[0]) || (dimensions1[2] != dimensions2[1]))
+        {
+            return false;
+        }
+
+        for(unsigned int i = 3; i < dimensions1.num_dimensions(); ++i)
+        {
+            if(dimensions1[i] != dimensions2[i])
+            {
+                return false;
+            }
+        }
     }
 
     return true;
@@ -241,27 +273,40 @@
 void validate_keypoints(T target_first, T target_last, U reference_first, U reference_last, V tolerance = AbsoluteTolerance<float>(),
                         float allowed_missing_percentage = 5.f, float allowed_mismatch_percentage = 5.f);
 
+/** Validate detection windows. */
+template <typename T, typename U, typename V = AbsoluteTolerance<float>>
+void validate_detection_windows(T target_first, T target_last, U reference_first, U reference_last, V tolerance = AbsoluteTolerance<float>(),
+                                float allowed_missing_percentage = 5.f, float allowed_mismatch_percentage = 5.f);
+
 template <typename T>
 struct compare_base
 {
+    /** Construct a comparison object.
+     *
+     * @param[in] target    Target value.
+     * @param[in] reference Reference value.
+     * @param[in] tolerance Allowed tolerance.
+     */
     compare_base(typename T::value_type target, typename T::value_type reference, T tolerance = T(0))
         : _target{ target }, _reference{ reference }, _tolerance{ tolerance }
     {
     }
 
-    typename T::value_type _target{};
-    typename T::value_type _reference{};
-    T                      _tolerance{};
+    typename T::value_type _target{};    /**< Target value */
+    typename T::value_type _reference{}; /**< Reference value */
+    T                      _tolerance{}; /**< Tolerance value */
 };
 
 template <typename T>
 struct compare;
 
+/** Compare values with an absolute tolerance */
 template <typename U>
 struct compare<AbsoluteTolerance<U>> : public compare_base<AbsoluteTolerance<U>>
 {
     using compare_base<AbsoluteTolerance<U>>::compare_base;
 
+    /** Perform comparison */
     operator bool() const
     {
         if(!support::cpp11::isfinite(this->_target) || !support::cpp11::isfinite(this->_reference))
@@ -281,11 +326,13 @@
     }
 };
 
+/** Compare values with a relative tolerance */
 template <typename U>
 struct compare<RelativeTolerance<U>> : public compare_base<RelativeTolerance<U>>
 {
     using compare_base<RelativeTolerance<U>>::compare_base;
 
+    /** Perform comparison */
     operator bool() const
     {
         if(!support::cpp11::isfinite(this->_target) || !support::cpp11::isfinite(this->_reference))
@@ -310,7 +357,7 @@
                 return false;
             }
 
-            const double relative_change = std::abs(static_cast<double>(this->_target) - static_cast<double>(this->_reference)) / this->_reference;
+            const double relative_change = std::abs((static_cast<double>(this->_target) - static_cast<double>(this->_reference)) / this->_reference);
 
             return relative_change <= static_cast<U>(this->_tolerance);
         }
@@ -321,14 +368,14 @@
 void validate(const IAccessor &tensor, const SimpleTensor<T> &reference, U tolerance_value, float tolerance_number, float absolute_tolerance_value)
 {
     // Validate with valid region covering the entire shape
-    validate(tensor, reference, shape_to_valid_region(tensor.shape()), tolerance_value, tolerance_number, absolute_tolerance_value);
+    validate(tensor, reference, shape_to_valid_region(reference.shape()), tolerance_value, tolerance_number, absolute_tolerance_value);
 }
 
 template <typename T, typename U, typename = typename std::enable_if<std::is_integral<T>::value>::type>
 void validate_wrap(const IAccessor &tensor, const SimpleTensor<T> &reference, U tolerance_value, float tolerance_number)
 {
     // Validate with valid region covering the entire shape
-    validate_wrap(tensor, reference, shape_to_valid_region(tensor.shape()), tolerance_value, tolerance_number);
+    validate_wrap(tensor, reference, shape_to_valid_region(reference.shape()), tolerance_value, tolerance_number);
 }
 
 template <typename T, typename U>
@@ -346,7 +393,7 @@
     }
 
     ARM_COMPUTE_EXPECT_EQUAL(tensor.num_channels(), reference.num_channels(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(compare_dimensions(tensor.shape(), reference.shape()), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(compare_dimensions(tensor.shape(), reference.shape(), tensor.data_layout()), framework::LogLevel::ERRORS);
 
     const int min_elements = std::min(tensor.num_elements(), reference.num_elements());
     const int min_channels = std::min(tensor.num_channels(), reference.num_channels());
@@ -356,12 +403,18 @@
     {
         const Coordinates id = index2coord(reference.shape(), element_idx);
 
+        Coordinates target_id(id);
+        if(tensor.data_layout() == DataLayout::NHWC)
+        {
+            permute(target_id, PermutationVector(2U, 0U, 1U));
+        }
+
         if(is_in_valid_region(valid_region, id))
         {
             // Iterate over all channels within one element
             for(int c = 0; c < min_channels; ++c)
             {
-                const T &target_value    = reinterpret_cast<const T *>(tensor(id))[c];
+                const T &target_value    = reinterpret_cast<const T *>(tensor(target_id))[c];
                 const T &reference_value = reinterpret_cast<const T *>(reference(id))[c];
 
                 if(!compare<U>(target_value, reference_value, tolerance_value))
@@ -415,7 +468,7 @@
     }
 
     ARM_COMPUTE_EXPECT_EQUAL(tensor.num_channels(), reference.num_channels(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(compare_dimensions(tensor.shape(), reference.shape()), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(compare_dimensions(tensor.shape(), reference.shape(), tensor.data_layout()), framework::LogLevel::ERRORS);
 
     const int min_elements = std::min(tensor.num_elements(), reference.num_elements());
     const int min_channels = std::min(tensor.num_channels(), reference.num_channels());
@@ -425,12 +478,18 @@
     {
         const Coordinates id = index2coord(reference.shape(), element_idx);
 
+        Coordinates target_id(id);
+        if(tensor.data_layout() == DataLayout::NHWC)
+        {
+            permute(target_id, PermutationVector(2U, 0U, 1U));
+        }
+
         if(is_in_valid_region(valid_region, id))
         {
             // Iterate over all channels within one element
             for(int c = 0; c < min_channels; ++c)
             {
-                const T &target_value    = reinterpret_cast<const T *>(tensor(id))[c];
+                const T &target_value    = reinterpret_cast<const T *>(tensor(target_id))[c];
                 const T &reference_value = reinterpret_cast<const T *>(reference(id))[c];
 
                 bool equal = compare<U>(target_value, reference_value, tolerance_value);
@@ -497,7 +556,7 @@
     }
 
     ARM_COMPUTE_EXPECT_EQUAL(tensor.num_channels(), reference.num_channels(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(compare_dimensions(tensor.shape(), reference.shape()), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(compare_dimensions(tensor.shape(), reference.shape(), tensor.data_layout()), framework::LogLevel::ERRORS);
 
     const int min_elements = std::min(tensor.num_elements(), reference.num_elements());
     const int min_channels = std::min(tensor.num_channels(), reference.num_channels());
@@ -507,12 +566,18 @@
     {
         const Coordinates id = index2coord(reference.shape(), element_idx);
 
+        Coordinates target_id(id);
+        if(tensor.data_layout() == DataLayout::NHWC)
+        {
+            permute(target_id, PermutationVector(2U, 0U, 1U));
+        }
+
         if(valid_mask[element_idx] == 1)
         {
             // Iterate over all channels within one element
             for(int c = 0; c < min_channels; ++c)
             {
-                const T &target_value    = reinterpret_cast<const T *>(tensor(id))[c];
+                const T &target_value    = reinterpret_cast<const T *>(tensor(target_id))[c];
                 const T &reference_value = reinterpret_cast<const T *>(reference(id))[c];
 
                 if(!compare<U>(target_value, reference_value, tolerance_value))
@@ -716,6 +781,77 @@
     }
 }
 
+/** Check which detection windows from [first1, last1) are missing in [first2, last2) */
+template <typename T, typename U, typename V>
+std::pair<int64_t, int64_t> compare_detection_windows(T first1, T last1, U first2, U last2, V tolerance)
+{
+    int64_t num_missing    = 0;
+    int64_t num_mismatches = 0;
+
+    while(first1 != last1)
+    {
+        const auto window = std::find_if(first2, last2, [&](const DetectionWindow &window)
+        {
+            return window.x == first1->x && window.y == first1->y && window.width == first1->width && window.height == first1->height && window.idx_class == first1->idx_class;
+        });
+
+        if(window == last2)
+        {
+            ++num_missing;
+            ARM_COMPUTE_TEST_INFO("Detection window not found " << *first1)
+        }
+        else
+        {
+            if(!compare<V>(window->score, first1->score, tolerance))
+            {
+                ++num_mismatches;
+                ARM_COMPUTE_TEST_INFO("Mismatching detection window")
+                ARM_COMPUTE_TEST_INFO("detection window 1= " << *first1)
+                ARM_COMPUTE_TEST_INFO("detection window 2= " << *window)
+            }
+        }
+
+        ++first1;
+    }
+
+    return std::make_pair(num_missing, num_mismatches);
+}
+
+template <typename T, typename U, typename V>
+void validate_detection_windows(T target_first, T target_last, U reference_first, U reference_last, V tolerance,
+                                float allowed_missing_percentage, float allowed_mismatch_percentage)
+{
+    const int64_t num_elements_target    = std::distance(target_first, target_last);
+    const int64_t num_elements_reference = std::distance(reference_first, reference_last);
+
+    int64_t num_missing    = 0;
+    int64_t num_mismatches = 0;
+
+    if(num_elements_reference > 0)
+    {
+        std::tie(num_missing, num_mismatches) = compare_detection_windows(reference_first, reference_last, target_first, target_last, tolerance);
+
+        const float percent_missing    = static_cast<float>(num_missing) / num_elements_reference * 100.f;
+        const float percent_mismatches = static_cast<float>(num_mismatches) / num_elements_reference * 100.f;
+
+        ARM_COMPUTE_TEST_INFO(num_missing << " detection windows (" << std::fixed << std::setprecision(2) << percent_missing << "%) are missing in target");
+        ARM_COMPUTE_EXPECT(percent_missing <= allowed_missing_percentage, framework::LogLevel::ERRORS);
+
+        ARM_COMPUTE_TEST_INFO(num_mismatches << " detection windows (" << std::fixed << std::setprecision(2) << percent_mismatches << "%) mismatched");
+        ARM_COMPUTE_EXPECT(percent_mismatches <= allowed_mismatch_percentage, framework::LogLevel::ERRORS);
+    }
+
+    if(num_elements_target > 0)
+    {
+        std::tie(num_missing, num_mismatches) = compare_detection_windows(target_first, target_last, reference_first, reference_last, tolerance);
+
+        const float percent_missing = static_cast<float>(num_missing) / num_elements_target * 100.f;
+
+        ARM_COMPUTE_TEST_INFO(num_missing << " detection windows (" << std::fixed << std::setprecision(2) << percent_missing << "%) are not part of target");
+        ARM_COMPUTE_EXPECT(percent_missing <= allowed_missing_percentage, framework::LogLevel::ERRORS);
+    }
+}
+
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/fixtures/BatchNormalizationLayerFixture.h b/tests/validation/fixtures/BatchNormalizationLayerFixture.h
index e02c619..b7e32a6 100644
--- a/tests/validation/fixtures/BatchNormalizationLayerFixture.h
+++ b/tests/validation/fixtures/BatchNormalizationLayerFixture.h
@@ -45,12 +45,15 @@
 {
 public:
     template <typename...>
-    void setup(TensorShape shape0, TensorShape shape1, float epsilon, ActivationLayerInfo act_info, DataType dt, int fractional_bits)
+    void setup(TensorShape shape0, TensorShape shape1, float epsilon, bool use_beta, bool use_gamma, ActivationLayerInfo act_info, DataType dt, DataLayout data_layout, int fractional_bits)
     {
         _fractional_bits = fractional_bits;
         _data_type       = dt;
-        _target          = compute_target(shape0, shape1, epsilon, act_info, dt, fractional_bits);
-        _reference       = compute_reference(shape0, shape1, epsilon, act_info, dt, fractional_bits);
+        _use_beta        = use_beta;
+        _use_gamma       = use_gamma;
+
+        _target    = compute_target(shape0, shape1, epsilon, act_info, dt, data_layout, fractional_bits);
+        _reference = compute_reference(shape0, shape1, epsilon, act_info, dt, fractional_bits);
     }
 
 protected:
@@ -67,8 +70,24 @@
             library->fill(src_tensor, distribution, 0);
             library->fill(mean_tensor, distribution, 1);
             library->fill(var_tensor, distribution_var, 0);
-            library->fill(beta_tensor, distribution, 3);
-            library->fill(gamma_tensor, distribution, 4);
+            if(_use_beta)
+            {
+                library->fill(beta_tensor, distribution, 3);
+            }
+            else
+            {
+                // Fill with default value 0.f
+                library->fill_tensor_value(beta_tensor, 0.f);
+            }
+            if(_use_gamma)
+            {
+                library->fill(gamma_tensor, distribution, 4);
+            }
+            else
+            {
+                // Fill with default value 1.f
+                library->fill_tensor_value(gamma_tensor, 1.f);
+            }
         }
         else
         {
@@ -80,16 +99,37 @@
             library->fill(src_tensor, distribution, 0);
             library->fill(mean_tensor, distribution, 1);
             library->fill(var_tensor, distribution_var, 0);
-            library->fill(beta_tensor, distribution, 3);
-            library->fill(gamma_tensor, distribution, 4);
+            if(_use_beta)
+            {
+                library->fill(beta_tensor, distribution, 3);
+            }
+            else
+            {
+                // Fill with default value 0
+                library->fill_tensor_value(beta_tensor, static_cast<T>(0));
+            }
+            if(_use_gamma)
+            {
+                library->fill(gamma_tensor, distribution, 4);
+            }
+            else
+            {
+                // Fill with default value 1
+                library->fill_tensor_value(gamma_tensor, static_cast<T>(1 << (_fractional_bits)));
+            }
         }
     }
 
-    TensorType compute_target(const TensorShape &shape0, const TensorShape &shape1, float epsilon, ActivationLayerInfo act_info, DataType dt, int fixed_point_position)
+    TensorType compute_target(TensorShape shape0, const TensorShape &shape1, float epsilon, ActivationLayerInfo act_info, DataType dt, DataLayout data_layout, int fixed_point_position)
     {
+        if(data_layout == DataLayout::NHWC)
+        {
+            permute(shape0, PermutationVector(2U, 0U, 1U));
+        }
+
         // Create tensors
-        TensorType src   = create_tensor<TensorType>(shape0, dt, 1, fixed_point_position);
-        TensorType dst   = create_tensor<TensorType>(shape0, dt, 1, fixed_point_position);
+        TensorType src   = create_tensor<TensorType>(shape0, dt, 1, fixed_point_position, QuantizationInfo(), data_layout);
+        TensorType dst   = create_tensor<TensorType>(shape0, dt, 1, fixed_point_position, QuantizationInfo(), data_layout);
         TensorType mean  = create_tensor<TensorType>(shape1, dt, 1, fixed_point_position);
         TensorType var   = create_tensor<TensorType>(shape1, dt, 1, fixed_point_position);
         TensorType beta  = create_tensor<TensorType>(shape1, dt, 1, fixed_point_position);
@@ -97,7 +137,9 @@
 
         // Create and configure function
         FunctionType norm;
-        norm.configure(&src, &dst, &mean, &var, &beta, &gamma, epsilon, act_info);
+        TensorType *beta_ptr  = _use_beta ? &beta : nullptr;
+        TensorType *gamma_ptr = _use_gamma ? &gamma : nullptr;
+        norm.configure(&src, &dst, &mean, &var, beta_ptr, gamma_ptr, epsilon, act_info);
 
         ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
         ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
@@ -149,6 +191,8 @@
     SimpleTensor<T> _reference{};
     int             _fractional_bits{};
     DataType        _data_type{};
+    bool            _use_beta{};
+    bool            _use_gamma{};
 };
 
 template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
@@ -156,9 +200,9 @@
 {
 public:
     template <typename...>
-    void setup(TensorShape shape0, TensorShape shape1, float epsilon, ActivationLayerInfo act_info, DataType dt)
+    void setup(TensorShape shape0, TensorShape shape1, float epsilon, bool use_beta, bool use_gamma, ActivationLayerInfo act_info, DataType dt, DataLayout data_layout)
     {
-        BatchNormalizationLayerValidationFixedPointFixture<TensorType, AccessorType, FunctionType, T>::setup(shape0, shape1, epsilon, act_info, dt, 0);
+        BatchNormalizationLayerValidationFixedPointFixture<TensorType, AccessorType, FunctionType, T>::setup(shape0, shape1, epsilon, use_beta, use_gamma, act_info, dt, data_layout, 0);
     }
 };
 } // namespace validation
diff --git a/tests/validation/fixtures/ChannelCombineFixture.h b/tests/validation/fixtures/ChannelCombineFixture.h
new file mode 100644
index 0000000..68d0237
--- /dev/null
+++ b/tests/validation/fixtures/ChannelCombineFixture.h
@@ -0,0 +1,266 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_TEST_CHANNEL_COMBINE_FIXTURE
+#define ARM_COMPUTE_TEST_CHANNEL_COMBINE_FIXTURE
+
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Types.h"
+#include "tests/AssetsLibrary.h"
+#include "tests/Globals.h"
+#include "tests/IAccessor.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Fixture.h"
+#include "tests/validation/Helpers.h"
+#include "tests/validation/reference/ChannelCombine.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace
+{
+template <typename TensorType>
+inline std::vector<TensorType> create_tensor_planes(const TensorShape &shape, Format format)
+{
+    TensorShape image_shape = adjust_odd_shape(shape, format);
+    TensorInfo  info(image_shape, Format::U8);
+
+    std::vector<TensorType> tensor_planes;
+
+    switch(format)
+    {
+        case Format::RGB888:
+        case Format::RGBA8888:
+        case Format::YUV444:
+        {
+            tensor_planes.resize(3);
+
+            if(format == Format::RGBA8888)
+            {
+                tensor_planes.resize(4);
+            }
+
+            for(unsigned int plane_idx = 0; plane_idx < tensor_planes.size(); ++plane_idx)
+            {
+                tensor_planes[plane_idx].allocator()->init(info);
+            }
+
+            break;
+        }
+        case Format::YUYV422:
+        case Format::UYVY422:
+        {
+            const TensorShape uv_shape = calculate_subsampled_shape(image_shape, format);
+            const TensorInfo  info_hor2(uv_shape, Format::U8);
+
+            tensor_planes.resize(3);
+
+            tensor_planes[0].allocator()->init(info);
+            tensor_planes[1].allocator()->init(info_hor2);
+            tensor_planes[2].allocator()->init(info_hor2);
+            break;
+        }
+        case Format::NV12:
+        case Format::NV21:
+        case Format::IYUV:
+        {
+            const TensorShape sub2_shape = calculate_subsampled_shape(image_shape, format);
+            const TensorInfo  info_sub2(sub2_shape, Format::U8);
+
+            tensor_planes.resize(3);
+
+            tensor_planes[0].allocator()->init(info);
+            tensor_planes[1].allocator()->init(info_sub2);
+            tensor_planes[2].allocator()->init(info_sub2);
+            break;
+        }
+        default:
+            ARM_COMPUTE_ERROR("Not supported");
+            break;
+    }
+
+    return tensor_planes;
+}
+} // namespace
+
+template <typename MultiImageType, typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class ChannelCombineValidationFixture : public framework::Fixture
+{
+public:
+    template <typename...>
+    void setup(TensorShape shape, Format format)
+    {
+        _num_planes = num_planes_from_format(format);
+        _target     = compute_target(shape, format);
+        _reference  = compute_reference(shape, format);
+    }
+
+protected:
+    template <typename U>
+    void fill(U &&tensor, int i)
+    {
+        library->fill_tensor_uniform(tensor, i);
+    }
+
+    template <typename U>
+    std::vector<SimpleTensor<U>> create_tensor_planes_reference(const TensorShape &shape, Format format)
+    {
+        std::vector<SimpleTensor<U>> tensor_planes;
+
+        TensorShape image_shape = adjust_odd_shape(shape, format);
+
+        switch(format)
+        {
+            case Format::RGB888:
+            case Format::RGBA8888:
+            case Format::YUV444:
+            {
+                if(format == Format::RGBA8888)
+                {
+                    tensor_planes.emplace_back(image_shape, Format::U8);
+                }
+
+                tensor_planes.emplace_back(image_shape, Format::U8);
+                tensor_planes.emplace_back(image_shape, Format::U8);
+                tensor_planes.emplace_back(image_shape, Format::U8);
+                break;
+            }
+            case Format::YUYV422:
+            case Format::UYVY422:
+            {
+                const TensorShape hor2_shape = calculate_subsampled_shape(image_shape, format);
+
+                tensor_planes.emplace_back(image_shape, Format::U8);
+                tensor_planes.emplace_back(hor2_shape, Format::U8);
+                tensor_planes.emplace_back(hor2_shape, Format::U8);
+                break;
+            }
+            case Format::NV12:
+            case Format::NV21:
+            case Format::IYUV:
+            {
+                const TensorShape shape_sub2 = calculate_subsampled_shape(image_shape, format);
+
+                tensor_planes.emplace_back(image_shape, Format::U8);
+                tensor_planes.emplace_back(shape_sub2, Format::U8);
+                tensor_planes.emplace_back(shape_sub2, Format::U8);
+                break;
+            }
+            default:
+                ARM_COMPUTE_ERROR("Not supported");
+                break;
+        }
+
+        return tensor_planes;
+    }
+
+    MultiImageType compute_target(const TensorShape &shape, Format format)
+    {
+        // Create tensors
+        std::vector<TensorType> ref_src = create_tensor_planes<TensorType>(shape, format);
+        MultiImageType          dst     = create_multi_image<MultiImageType>(shape, format);
+
+        // Create and configure function
+        FunctionType channel_combine;
+
+        if(1 == _num_planes)
+        {
+            const TensorType *tensor_extra = ((Format::RGBA8888 == format) ? &ref_src[3] : nullptr);
+            TensorType       *tensor_dst   = dynamic_cast<TensorType *>(dst.plane(0));
+
+            channel_combine.configure(&ref_src[0], &ref_src[1], &ref_src[2], tensor_extra, tensor_dst);
+        }
+        else
+        {
+            channel_combine.configure(&ref_src[0], &ref_src[1], &ref_src[2], &dst);
+        }
+
+        for(unsigned int plane_idx = 0; plane_idx < _num_planes; ++plane_idx)
+        {
+            const TensorType *dst_plane = static_cast<const TensorType *>(dst.plane(plane_idx));
+
+            ARM_COMPUTE_EXPECT(dst_plane->info()->is_resizable(), framework::LogLevel::ERRORS);
+        }
+
+        for(unsigned int plane_idx = 0; plane_idx < ref_src.size(); ++plane_idx)
+        {
+            ARM_COMPUTE_EXPECT(ref_src[plane_idx].info()->is_resizable(), framework::LogLevel::ERRORS);
+        }
+
+        // Allocate tensors
+        dst.allocate();
+
+        for(unsigned int plane_idx = 0; plane_idx < ref_src.size(); ++plane_idx)
+        {
+            ref_src[plane_idx].allocator()->allocate();
+        }
+
+        for(unsigned int plane_idx = 0; plane_idx < _num_planes; ++plane_idx)
+        {
+            const TensorType *dst_plane = static_cast<const TensorType *>(dst.plane(plane_idx));
+
+            ARM_COMPUTE_EXPECT(!dst_plane->info()->is_resizable(), framework::LogLevel::ERRORS);
+        }
+
+        for(unsigned int plane_idx = 0; plane_idx < ref_src.size(); ++plane_idx)
+        {
+            ARM_COMPUTE_EXPECT(!ref_src[plane_idx].info()->is_resizable(), framework::LogLevel::ERRORS);
+        }
+
+        // Fill tensor planes
+        for(unsigned int plane_idx = 0; plane_idx < ref_src.size(); ++plane_idx)
+        {
+            fill(AccessorType(ref_src[plane_idx]), plane_idx);
+        }
+
+        // Compute function
+        channel_combine.run();
+
+        return dst;
+    }
+
+    std::vector<SimpleTensor<T>> compute_reference(const TensorShape &shape, Format format)
+    {
+        // Create reference
+        std::vector<SimpleTensor<T>> ref_src = create_tensor_planes_reference<T>(shape, format);
+
+        // Fill references
+        for(unsigned int plane_idx = 0; plane_idx < ref_src.size(); ++plane_idx)
+        {
+            fill(ref_src[plane_idx], plane_idx);
+        }
+
+        return reference::channel_combine<T>(shape, ref_src, format);
+    }
+
+    unsigned int                 _num_planes{};
+    MultiImageType               _target{};
+    std::vector<SimpleTensor<T>> _reference{};
+};
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_TEST_CHANNEL_COMBINE_FIXTURE */
diff --git a/tests/validation/fixtures/ChannelShuffleLayerFixture.h b/tests/validation/fixtures/ChannelShuffleLayerFixture.h
new file mode 100644
index 0000000..9746480
--- /dev/null
+++ b/tests/validation/fixtures/ChannelShuffleLayerFixture.h
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_TEST_CHANNEL_SHUFFLE_FIXTURE
+#define ARM_COMPUTE_TEST_CHANNEL_SHUFFLE_FIXTURE
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "tests/AssetsLibrary.h"
+#include "tests/Globals.h"
+#include "tests/IAccessor.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Fixture.h"
+#include "tests/validation/reference/ChannelShuffle.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class ChannelShuffleLayerValidationFixture : public framework::Fixture
+{
+public:
+    template <typename...>
+    void setup(TensorShape shape, unsigned int num_groups, DataType data_type)
+    {
+        _target    = compute_target(shape, data_type, num_groups);
+        _reference = compute_reference(shape, data_type, num_groups);
+    }
+
+protected:
+    template <typename U>
+    void fill(U &&tensor)
+    {
+        library->fill_tensor_uniform(tensor, 0);
+    }
+
+    TensorType compute_target(const TensorShape &shape, DataType data_type, unsigned int num_groups)
+    {
+        // Create tensors
+        TensorType src = create_tensor<TensorType>(shape, data_type);
+        TensorType dst = create_tensor<TensorType>(shape, data_type);
+
+        // Create and configure function
+        FunctionType channel_shuffle_func;
+        channel_shuffle_func.configure(&src, &dst, num_groups);
+
+        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+        // Allocate tensors
+        src.allocator()->allocate();
+        dst.allocator()->allocate();
+
+        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+        // Fill tensors
+        fill(AccessorType(src));
+
+        // Compute function
+        channel_shuffle_func.run();
+
+        return dst;
+    }
+
+    SimpleTensor<T> compute_reference(const TensorShape &shape, DataType data_type, unsigned int num_groups)
+    {
+        // Create reference
+        SimpleTensor<T> src{ shape, data_type };
+
+        // Fill reference
+        fill(src);
+
+        return reference::channel_shuffle<T>(src, num_groups);
+    }
+
+    TensorType      _target{};
+    SimpleTensor<T> _reference{};
+};
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_TEST_CHANNEL_SHUFFLE_FIXTURE */
diff --git a/tests/validation/fixtures/ConvertFullyConnectedWeightsFixture.h b/tests/validation/fixtures/ConvertFullyConnectedWeightsFixture.h
new file mode 100644
index 0000000..0fcef5c
--- /dev/null
+++ b/tests/validation/fixtures/ConvertFullyConnectedWeightsFixture.h
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_TEST_CONVERT_FULLY_CONNECTED_WEIGHTS_FIXTURE
+#define ARM_COMPUTE_TEST_CONVERT_FULLY_CONNECTED_WEIGHTS_FIXTURE
+
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Types.h"
+#include "tests/AssetsLibrary.h"
+#include "tests/Globals.h"
+#include "tests/IAccessor.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Fixture.h"
+#include "tests/validation/reference/ConvertFullyConnectedWeights.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class ConvertFullyConnectedWeightsValidationFixture : public framework::Fixture
+{
+public:
+    template <typename...>
+    void setup(TensorShape input_shape, unsigned int weights_w, DataLayout training_data_layout, DataType data_type)
+    {
+        const unsigned int height = input_shape.x() * input_shape.y() * input_shape.z();
+        const TensorShape  weights_shape(weights_w, height);
+
+        _target    = compute_target(input_shape, weights_shape, training_data_layout, data_type);
+        _reference = compute_reference(input_shape, weights_shape, training_data_layout, data_type);
+    }
+
+protected:
+    template <typename U>
+    void fill(U &&tensor, int i)
+    {
+        switch(tensor.data_type())
+        {
+            case DataType::QASYMM8:
+            {
+                std::uniform_int_distribution<uint8_t> distribution(0, 10);
+                library->fill(tensor, distribution, i);
+                break;
+            }
+            case DataType::F32:
+            case DataType::F16:
+            {
+                std::uniform_real_distribution<> distribution(-1.0f, 1.0f);
+                library->fill(tensor, distribution, i);
+                break;
+            }
+            default:
+                library->fill_tensor_uniform(tensor, i);
+        }
+    }
+
+    TensorType compute_target(const TensorShape &input_shape, const TensorShape &weights_shape, const DataLayout training_data_layout, const DataType data_type)
+    {
+        // Create tensors
+        TensorType src = create_tensor<TensorType>(weights_shape, data_type);
+        TensorType dst = create_tensor<TensorType>(weights_shape, data_type);
+
+        // Create and configure function
+        FunctionType convert_weights;
+
+        convert_weights.configure(&src, &dst, input_shape, training_data_layout);
+
+        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+        // Allocate tensors
+        src.allocator()->allocate();
+        dst.allocator()->allocate();
+
+        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+        // Fill tensors
+        fill(AccessorType(src), 0);
+
+        // Compute function
+        convert_weights.run();
+
+        return dst;
+    }
+
+    SimpleTensor<T> compute_reference(const TensorShape &input_shape, const TensorShape &weights_shape, const DataLayout training_data_layout, const DataType data_type)
+    {
+        // Create reference
+        SimpleTensor<T> src{ weights_shape, data_type };
+
+        // Fill reference
+        fill(src, 0);
+
+        return reference::convert_fully_connected_weights(src, input_shape, training_data_layout);
+    }
+
+    TensorType      _target{};
+    SimpleTensor<T> _reference{};
+};
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_TEST_CONVERT_FULLY_CONNECTED_WEIGHTS_FIXTURE */
diff --git a/tests/validation/fixtures/ConvolutionLayerFixture.h b/tests/validation/fixtures/ConvolutionLayerFixture.h
index 48b4665..93de24d 100644
--- a/tests/validation/fixtures/ConvolutionLayerFixture.h
+++ b/tests/validation/fixtures/ConvolutionLayerFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -33,7 +33,9 @@
 #include "tests/framework/Asserts.h"
 #include "tests/framework/Fixture.h"
 #include "tests/validation/Helpers.h"
+#include "tests/validation/reference/ActivationLayer.h"
 #include "tests/validation/reference/ConvolutionLayer.h"
+#include "tests/validation/reference/Permute.h"
 #include "tests/validation/reference/Utils.h"
 
 #include <random>
@@ -54,17 +56,18 @@
 
 public:
     template <typename...>
-    void setup(TensorShape input_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape output_shape, PadStrideInfo info, bool reshape_weights,
-               DataType data_type, int fractional_bits, QuantizationInfo quantization_info)
+    void setup(TensorShape input_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape output_shape, PadStrideInfo info, Size2D dilation, bool reshape_weights,
+               DataType data_type, DataLayout data_layout, int fractional_bits, QuantizationInfo quantization_info, ActivationLayerInfo act_info)
     {
         _data_type         = data_type;
         _is_quantized      = is_data_type_quantized_asymmetric(data_type);
         _bias_data_type    = _is_quantized ? DataType::S32 : data_type;
         _fractional_bits   = fractional_bits;
         _quantization_info = quantization_info;
+        _data_layout       = data_layout;
 
-        _target    = compute_target(input_shape, weights_shape, bias_shape, output_shape, info, reshape_weights);
-        _reference = compute_reference(input_shape, weights_shape, bias_shape, output_shape, info);
+        _target    = compute_target(input_shape, weights_shape, bias_shape, output_shape, info, reshape_weights, dilation, act_info);
+        _reference = compute_reference(input_shape, weights_shape, bias_shape, output_shape, info, dilation, act_info);
     }
 
 protected:
@@ -97,54 +100,31 @@
         }
     }
 
-    TensorType compute_target(const TensorShape &input_shape, const TensorShape &weights_shape, const TensorShape &bias_shape, const TensorShape &output_shape, const PadStrideInfo &info,
-                              bool reshape_weights)
+    TensorType compute_target(TensorShape input_shape, TensorShape weights_shape, const TensorShape &bias_shape, TensorShape output_shape, const PadStrideInfo &info,
+                              bool reshape_weights, const Size2D &dilation, const ActivationLayerInfo act_info)
     {
-        WeightsInfo weights_info(!reshape_weights, weights_shape.x(), weights_shape.y(), weights_shape[3]);
-        TensorShape reshaped_weights_shape(weights_shape);
-
-        if(!reshape_weights)
+        if(_data_layout == DataLayout::NHWC)
         {
-            // Check if its a "fully connected" convolution
-            const bool is_fully_connected_convolution = (output_shape.x() == 1 && output_shape.y() == 1);
-            bool       is_optimised                   = false;
-#if defined(__arm__)
-            is_optimised = std::is_same<FunctionType, NEConvolutionLayer>::value && NEScheduler::get().cpu_info().CPU == CPUTarget::ARMV7 && _data_type == DataType::F32;
-#elif defined(__aarch64__)
-            is_optimised = std::is_same<FunctionType, NEConvolutionLayer>::value && NEScheduler::get().cpu_info().CPU >= CPUTarget::ARMV8 && _data_type == DataType::F32;
-#endif /* defined(__arm__) || defined(__aarch64__) */
-
-            reshaped_weights_shape.collapse(3);
-
-            if(bias_shape.total_size() > 0 && !_is_quantized)
-            {
-                // Add bias to the weights reshaped matrix
-                reshaped_weights_shape.set(0, reshaped_weights_shape.x() + 1);
-            }
-
-            if(is_fully_connected_convolution || is_optimised)
-            {
-                const size_t shape_x = reshaped_weights_shape.x();
-                reshaped_weights_shape.set(0, reshaped_weights_shape.y());
-                reshaped_weights_shape.set(1, shape_x);
-            }
-            else
-            {
-                const int interleave_width = 16 / data_size_from_type(_data_type);
-                reshaped_weights_shape.set(0, reshaped_weights_shape.x() * interleave_width);
-                reshaped_weights_shape.set(1, static_cast<unsigned int>(std::ceil(reshaped_weights_shape.y() / static_cast<float>(interleave_width))));
-            }
+            permute(input_shape, PermutationVector(2U, 0U, 1U));
+            permute(weights_shape, PermutationVector(2U, 0U, 1U));
+            permute(output_shape, PermutationVector(2U, 0U, 1U));
         }
 
+        const int idx_width  = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+        const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
+
+        WeightsInfo weights_info(!reshape_weights, weights_shape[idx_width], weights_shape[idx_height], weights_shape[3]);
+        TensorShape reshaped_weights_shape(weights_shape);
+
         // Create tensors
-        TensorType src     = create_tensor<TensorType>(input_shape, _data_type, 1, _fractional_bits, _quantization_info);
-        TensorType weights = create_tensor<TensorType>(reshaped_weights_shape, _data_type, 1, _fractional_bits, _quantization_info);
-        TensorType bias    = create_tensor<TensorType>(bias_shape, _bias_data_type, 1, _fractional_bits, _quantization_info);
-        TensorType dst     = create_tensor<TensorType>(output_shape, _data_type, 1, _fractional_bits, _quantization_info);
+        TensorType src     = create_tensor<TensorType>(input_shape, _data_type, 1, _fractional_bits, _quantization_info, _data_layout);
+        TensorType weights = create_tensor<TensorType>(reshaped_weights_shape, _data_type, 1, _fractional_bits, _quantization_info, _data_layout);
+        TensorType bias    = create_tensor<TensorType>(bias_shape, _bias_data_type, 1, _fractional_bits, _quantization_info, _data_layout);
+        TensorType dst     = create_tensor<TensorType>(output_shape, _data_type, 1, _fractional_bits, _quantization_info, _data_layout);
 
         // Create and configure function
         FunctionType conv;
-        conv.configure(&src, &weights, &bias, &dst, info, weights_info);
+        conv.configure(&src, &weights, &bias, &dst, info, weights_info, dilation, act_info);
 
         ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
         ARM_COMPUTE_EXPECT(weights.info()->is_resizable(), framework::LogLevel::ERRORS);
@@ -164,55 +144,8 @@
 
         // Fill tensors
         fill(AccessorType(src), 0);
-
-        if(!reshape_weights)
-        {
-            const bool is_fully_connected_convolution = (output_shape.x() == 1 && output_shape.y() == 1);
-            bool       is_optimised                   = false;
-#if defined(__arm__)
-            is_optimised = std::is_same<FunctionType, NEConvolutionLayer>::value && NEScheduler::get().cpu_info().CPU == CPUTarget::ARMV7 && _data_type == DataType::F32;
-#elif defined(__aarch64__)
-            is_optimised = std::is_same<FunctionType, NEConvolutionLayer>::value && NEScheduler::get().cpu_info().CPU >= CPUTarget::ARMV8 && _data_type == DataType::F32;
-#endif /* defined(__arm__) || defined(__aarch64__) */
-
-            TensorShape     tmp_weights_shape(weights_shape);
-            SimpleTensor<T> tmp_weights(tmp_weights_shape, _data_type, 1, _fractional_bits, _quantization_info);
-
-            // Fill with original shape
-            fill(tmp_weights, 1);
-
-            if(_is_quantized)
-            {
-                fill(AccessorType(bias), 2);
-                tmp_weights = linearise_weights(tmp_weights);
-            }
-            else
-            {
-                SimpleTensor<T> tmp_bias(bias_shape, _bias_data_type, 1, _fractional_bits, _quantization_info);
-                fill(tmp_bias, 2);
-                tmp_weights = linearise_weights(tmp_weights, &tmp_bias);
-            }
-
-            if(!is_fully_connected_convolution && !is_optimised)
-            {
-                // Transpose with interleave
-                const int interleave_size = 16 / tmp_weights.element_size();
-                tmp_weights               = transpose(std::move(tmp_weights), interleave_size);
-            }
-
-            AccessorType weights_accessor(weights);
-
-            for(int i = 0; i < tmp_weights.num_elements(); ++i)
-            {
-                Coordinates coord = index2coord(tmp_weights.shape(), i);
-                std::copy_n(static_cast<const T *>(tmp_weights(coord)), 1, static_cast<T *>(weights_accessor(coord)));
-            }
-        }
-        else
-        {
-            fill(AccessorType(weights), 1);
-            fill(AccessorType(bias), 2);
-        }
+        fill(AccessorType(weights), 1);
+        fill(AccessorType(bias), 2);
 
         // Compute NEConvolutionLayer function
         conv.run();
@@ -220,7 +153,8 @@
         return dst;
     }
 
-    SimpleTensor<T> compute_reference(const TensorShape &input_shape, const TensorShape &weights_shape, const TensorShape &bias_shape, const TensorShape &output_shape, const PadStrideInfo &info)
+    SimpleTensor<T> compute_reference(const TensorShape &input_shape, const TensorShape &weights_shape, const TensorShape &bias_shape, const TensorShape &output_shape, const PadStrideInfo &info,
+                                      const Size2D &dilation, const ActivationLayerInfo act_info)
     {
         // Create reference
         SimpleTensor<T>     src{ input_shape, _data_type, 1, _fractional_bits, _quantization_info };
@@ -232,60 +166,19 @@
         fill(weights, 1);
         fill(bias, 2);
 
-        return reference::convolution_layer<T>(src, weights, bias, output_shape, info);
+        return (act_info.enabled()) ? reference::activation_layer<T>(reference::convolution_layer<T>(src, weights, bias, output_shape, info, dilation),
+                                                                     act_info) :
+               reference::convolution_layer<T>(src, weights, bias, output_shape, info, dilation);
     }
 
     TensorType       _target{};
     SimpleTensor<T>  _reference{};
     DataType         _data_type{};
     DataType         _bias_data_type{};
+    DataLayout       _data_layout{};
     int              _fractional_bits{};
     QuantizationInfo _quantization_info{};
     bool             _is_quantized = false;
-
-private:
-    template <typename U>
-    SimpleTensor<U> linearise_weights(const SimpleTensor<U> &weights, const SimpleTensor<U> *biases = nullptr)
-    {
-        TensorShape dst_shape(weights.shape());
-        dst_shape.collapse(3);
-
-        if(biases != nullptr)
-        {
-            dst_shape.set(0, dst_shape.x() + 1);
-        }
-
-        const size_t shape_x = dst_shape.x();
-        dst_shape.set(0, dst_shape.y());
-        dst_shape.set(1, shape_x);
-
-        SimpleTensor<U> dst(dst_shape, weights.data_type());
-
-        // Don't iterate over biases yet
-        for(int weights_idx = 0; weights_idx < weights.num_elements(); ++weights_idx)
-        {
-            Coordinates weights_coord = index2coord(weights.shape(), weights_idx);
-            const int   dst_row       = weights_idx % weights.shape().total_size_lower(3);
-            Coordinates dst_coord{ weights_coord[3], dst_row, weights_coord[4] };
-            const int   dst_idx = coord2index(dst.shape(), dst_coord);
-
-            dst[dst_idx] = weights[weights_idx];
-        }
-        if(biases != nullptr)
-        {
-            // Fill last row with biases
-            for(int bias_idx = 0; bias_idx < biases->num_elements(); ++bias_idx)
-            {
-                Coordinates bias_coord = index2coord(biases->shape(), bias_idx);
-                Coordinates dst_coord{ bias_coord.x(), static_cast<int>(dst.shape().y()) - 1, bias_coord.y() };
-                int         dst_idx = coord2index(dst.shape(), dst_coord);
-
-                dst[dst_idx] = (*biases)[bias_idx];
-            }
-        }
-
-        return dst;
-    }
 };
 
 template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
@@ -293,9 +186,11 @@
 {
 public:
     template <typename...>
-    void setup(TensorShape input_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape output_shape, PadStrideInfo info, bool reshape_weights, DataType data_type)
+    void setup(TensorShape input_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape output_shape, PadStrideInfo info, Size2D dilation, bool reshape_weights, DataType data_type,
+               DataLayout data_layout, ActivationLayerInfo act_info)
     {
-        ConvolutionValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(input_shape, weights_shape, bias_shape, output_shape, info, reshape_weights, data_type, 0, QuantizationInfo());
+        ConvolutionValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(input_shape, weights_shape, bias_shape, output_shape, info, dilation, reshape_weights, data_type, data_layout, 0,
+                                                                                              QuantizationInfo(), act_info);
     }
 };
 
@@ -304,10 +199,12 @@
 {
 public:
     template <typename...>
-    void setup(TensorShape input_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape output_shape, PadStrideInfo info, bool reshape_weights, DataType data_type, int fractional_bits)
+    void setup(TensorShape input_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape output_shape, PadStrideInfo info, Size2D dilation, bool reshape_weights, DataType data_type,
+               int fractional_bits, ActivationLayerInfo act_info)
     {
-        ConvolutionValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(input_shape, weights_shape, bias_shape, output_shape, info, reshape_weights, data_type, fractional_bits,
-                                                                                              QuantizationInfo());
+        ConvolutionValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(input_shape, weights_shape, bias_shape, output_shape, info, dilation, reshape_weights, data_type,
+                                                                                              DataLayout::NCHW,
+                                                                                              fractional_bits, QuantizationInfo(), act_info);
     }
 };
 
@@ -316,10 +213,12 @@
 {
 public:
     template <typename...>
-    void setup(TensorShape input_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape output_shape, PadStrideInfo info, bool reshape_weights, DataType data_type,
-               QuantizationInfo quantization_info)
+    void setup(TensorShape input_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape output_shape, PadStrideInfo info, Size2D dilation, bool reshape_weights, DataType data_type,
+               QuantizationInfo quantization_info, ActivationLayerInfo act_info)
     {
-        ConvolutionValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(input_shape, weights_shape, bias_shape, output_shape, info, reshape_weights, data_type, 0, quantization_info);
+        ConvolutionValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(input_shape, weights_shape, bias_shape, output_shape, info, dilation, reshape_weights, data_type,
+                                                                                              DataLayout::NCHW, 0,
+                                                                                              quantization_info, act_info);
     }
 };
 } // namespace validation
diff --git a/tests/validation/fixtures/CopyFixture.h b/tests/validation/fixtures/CopyFixture.h
new file mode 100644
index 0000000..911d908
--- /dev/null
+++ b/tests/validation/fixtures/CopyFixture.h
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_TEST_COPY_FIXTURE
+#define ARM_COMPUTE_TEST_COPY_FIXTURE
+
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Types.h"
+#include "tests/AssetsLibrary.h"
+#include "tests/Globals.h"
+#include "tests/IAccessor.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Fixture.h"
+#include "tests/validation/reference/Copy.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class CopyFixture : public framework::Fixture
+{
+public:
+    template <typename...>
+    void setup(TensorShape input_shape, TensorShape output_shape, DataType data_type)
+    {
+        _target    = compute_target(input_shape, output_shape, data_type);
+        _reference = compute_reference(input_shape, output_shape, data_type);
+    }
+
+protected:
+    template <typename U>
+    void fill(U &&tensor, int i)
+    {
+        library->fill_tensor_uniform(tensor, i);
+    }
+
+    TensorType compute_target(const TensorShape &input_shape, const TensorShape &output_shape, DataType data_type)
+    {
+        // Check if the input shape can indeed be reshaped to the output one
+        ARM_COMPUTE_EXPECT(input_shape.total_size() == output_shape.total_size(), framework::LogLevel::ERRORS);
+
+        // Create tensors
+        TensorType src = create_tensor<TensorType>(input_shape, data_type);
+        TensorType dst = create_tensor<TensorType>(output_shape, data_type);
+
+        // Create and configure function
+        FunctionType copy;
+
+        copy.configure(&src, &dst);
+
+        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+        // Allocate tensors
+        src.allocator()->allocate();
+        dst.allocator()->allocate();
+
+        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+        // Fill tensors
+        fill(AccessorType(src), 0);
+
+        // Compute function
+        copy.run();
+
+        return dst;
+    }
+
+    SimpleTensor<T> compute_reference(const TensorShape &input_shape, const TensorShape &output_shape, DataType data_type)
+    {
+        // Create reference
+        SimpleTensor<T> src{ input_shape, data_type };
+
+        // Fill reference
+        fill(src, 0);
+
+        return reference::copy<T>(src, output_shape);
+    }
+
+    TensorType      _target{};
+    SimpleTensor<T> _reference{};
+};
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_TEST_COPY_FIXTURE */
diff --git a/tests/validation/fixtures/DepthwiseConvolutionLayerFixture.h b/tests/validation/fixtures/DepthwiseConvolutionLayerFixture.h
index df5436f..2f01f43 100644
--- a/tests/validation/fixtures/DepthwiseConvolutionLayerFixture.h
+++ b/tests/validation/fixtures/DepthwiseConvolutionLayerFixture.h
@@ -26,6 +26,7 @@
 
 #include "arm_compute/core/TensorShape.h"
 #include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "tests/AssetsLibrary.h"
 #include "tests/Globals.h"
 #include "tests/IAccessor.h"
@@ -44,6 +45,8 @@
 {
 namespace validation
 {
+using namespace arm_compute::misc::shape_calculator;
+
 template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
 class DepthwiseConvolutionLayerValidationGenericFixture : public framework::Fixture
 {
@@ -52,15 +55,23 @@
 
 public:
     template <typename...>
-    void setup(TensorShape in_shape, TensorShape weights_shape, TensorShape out_shape, PadStrideInfo pad_stride_info, DataType data_type, QuantizationInfo quantization_info)
+    void setup(TensorShape in_shape, Size2D kernel_size, PadStrideInfo pad_stride_info, unsigned int depth_multiplier, DataType data_type, QuantizationInfo quantization_info, DataLayout data_layout)
     {
-        _quantization_info = quantization_info;
-        _data_type         = data_type;
-        const TensorShape biases_shape(weights_shape[2]);
-        const DataType    bias_data_type = is_data_type_quantized_asymmetric(data_type) ? DataType::S32 : data_type;
+        _quantization_info            = quantization_info;
+        _data_type                    = data_type;
+        const DataType bias_data_type = is_data_type_quantized_asymmetric(data_type) ? DataType::S32 : data_type;
 
-        _target    = compute_target(in_shape, weights_shape, biases_shape, out_shape, pad_stride_info, data_type, bias_data_type, quantization_info);
-        _reference = compute_reference(in_shape, weights_shape, biases_shape, out_shape, pad_stride_info, data_type, bias_data_type, quantization_info);
+        TensorShape weights_shape(kernel_size.width, kernel_size.height);
+
+        const TensorInfo in_info(in_shape, 1, data_type);
+        const TensorInfo we_info(weights_shape, 1, data_type);
+        TensorShape      out_shape = compute_depthwise_convolution_shape(in_info, we_info, pad_stride_info, depth_multiplier);
+
+        weights_shape.set(2, out_shape.z());
+        const TensorShape biases_shape(weights_shape[2]);
+
+        _target    = compute_target(in_shape, weights_shape, biases_shape, out_shape, pad_stride_info, depth_multiplier, data_type, bias_data_type, quantization_info, data_layout);
+        _reference = compute_reference(in_shape, weights_shape, biases_shape, out_shape, pad_stride_info, depth_multiplier, data_type, bias_data_type, quantization_info);
     }
 
 protected:
@@ -93,18 +104,25 @@
         }
     }
 
-    TensorType compute_target(const TensorShape &input_shape, const TensorShape &weights_shape, const TensorShape &biases_shape, const TensorShape &output_shape, PadStrideInfo &pad_stride_info,
-                              const DataType data_type, const DataType bias_data_type, const QuantizationInfo quantization_info)
+    TensorType compute_target(TensorShape input_shape, TensorShape weights_shape, TensorShape biases_shape, TensorShape output_shape, PadStrideInfo &pad_stride_info, unsigned int depth_multiplier,
+                              const DataType data_type, const DataType bias_data_type, const QuantizationInfo quantization_info, const DataLayout data_layout)
     {
+        if(data_layout == DataLayout::NHWC)
+        {
+            permute(input_shape, PermutationVector(2U, 0U, 1U));
+            permute(weights_shape, PermutationVector(2U, 0U, 1U));
+            permute(output_shape, PermutationVector(2U, 0U, 1U));
+        }
+
         // Create tensors
-        TensorType src     = create_tensor<TensorType>(input_shape, data_type, 1, 0, quantization_info);
-        TensorType weights = create_tensor<TensorType>(weights_shape, data_type, 1, 0, quantization_info);
-        TensorType biases  = create_tensor<TensorType>(biases_shape, bias_data_type, 1, 0, quantization_info);
-        TensorType dst     = create_tensor<TensorType>(output_shape, data_type, 1, 0, quantization_info);
+        TensorType src     = create_tensor<TensorType>(input_shape, data_type, 1, 0, quantization_info, data_layout);
+        TensorType weights = create_tensor<TensorType>(weights_shape, data_type, 1, 0, quantization_info, data_layout);
+        TensorType biases  = create_tensor<TensorType>(biases_shape, bias_data_type, 1, 0, quantization_info, data_layout);
+        TensorType dst     = create_tensor<TensorType>(output_shape, data_type, 1, 0, quantization_info, data_layout);
 
         // Create Depthwise Convolution configure function
         FunctionType dwc;
-        dwc.configure(&src, &weights, &biases, &dst, pad_stride_info);
+        dwc.configure(&src, &weights, &biases, &dst, pad_stride_info, depth_multiplier);
 
         ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
         ARM_COMPUTE_EXPECT(weights.info()->is_resizable(), framework::LogLevel::ERRORS);
@@ -134,7 +152,8 @@
     }
 
     SimpleTensor<T> compute_reference(const TensorShape &in_shape, const TensorShape &weights_shape, const TensorShape &biases_shape, const TensorShape &out_shape, const PadStrideInfo &pad_stride_info,
-                                      const DataType data_type, const DataType bias_data_type, QuantizationInfo quantization_info)
+                                      unsigned int   depth_multiplier,
+                                      const DataType data_type, const DataType bias_data_type, const QuantizationInfo quantization_info)
     {
         SimpleTensor<T>     src{ in_shape, data_type, 1, 0, quantization_info };
         SimpleTensor<T>     weights{ weights_shape, data_type, 1, 0, quantization_info };
@@ -144,7 +163,7 @@
         fill(weights, 1);
         fill(biases, 2);
 
-        return reference::depthwise_convolution(src, weights, biases, out_shape, pad_stride_info);
+        return reference::depthwise_convolution(src, weights, biases, out_shape, pad_stride_info, depth_multiplier);
     }
 
     TensorType       _target{};
@@ -158,10 +177,10 @@
 {
 public:
     template <typename...>
-    void setup(TensorShape in_shape, TensorShape weights_shape, TensorShape out_shape, PadStrideInfo pad_stride_info, DataType data_type)
+    void setup(TensorShape in_shape, Size2D kernel_size, PadStrideInfo pad_stride_info, unsigned int depth_multiplier, DataType data_type, DataLayout data_layout)
     {
-        DepthwiseConvolutionLayerValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(in_shape, weights_shape, out_shape, pad_stride_info,
-                                                                                                            data_type, QuantizationInfo());
+        DepthwiseConvolutionLayerValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(in_shape, kernel_size, pad_stride_info, depth_multiplier,
+                                                                                                            data_type, QuantizationInfo(), data_layout);
     }
 };
 
@@ -170,10 +189,10 @@
 {
 public:
     template <typename...>
-    void setup(TensorShape in_shape, TensorShape weights_shape, TensorShape out_shape, PadStrideInfo pad_stride_info, DataType data_type, QuantizationInfo quantization_info)
+    void setup(TensorShape in_shape, Size2D kernel_size, PadStrideInfo pad_stride_info, unsigned int depth_multiplier, DataType data_type, QuantizationInfo quantization_info, DataLayout data_layout)
     {
-        DepthwiseConvolutionLayerValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(in_shape, weights_shape, out_shape, pad_stride_info,
-                                                                                                            data_type, quantization_info);
+        DepthwiseConvolutionLayerValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(in_shape, kernel_size, pad_stride_info, depth_multiplier,
+                                                                                                            data_type, quantization_info, data_layout);
     }
 };
 } // namespace validation
diff --git a/tests/validation/fixtures/DirectConvolutionLayerFixture.h b/tests/validation/fixtures/DirectConvolutionLayerFixture.h
index d63a5bc..38ddf33 100644
--- a/tests/validation/fixtures/DirectConvolutionLayerFixture.h
+++ b/tests/validation/fixtures/DirectConvolutionLayerFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,10 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
+#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorShape.h"
 #include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "tests/AssetsLibrary.h"
 #include "tests/Globals.h"
 #include "tests/IAccessor.h"
@@ -31,6 +33,7 @@
 #include "tests/validation/Helpers.h"
 #include "tests/validation/fixtures/ConvolutionLayerFixture.h"
 #include "tests/validation/reference/ConvolutionLayer.h"
+#include "tests/validation/reference/Permute.h"
 
 #include <random>
 
@@ -40,6 +43,8 @@
 {
 namespace validation
 {
+using namespace arm_compute::misc::shape_calculator;
+
 template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
 class DirectConvolutionValidationGenericFixture : public framework::Fixture
 {
@@ -49,34 +54,43 @@
 public:
     template <typename...>
     void setup(TensorShape input_shape, int stride_x, int stride_y, int pad_x, int pad_y, unsigned int kernel_size, unsigned int num_kernels,
-               DataType data_type, int fractional_bits, QuantizationInfo quantization_info)
+               DataType data_type, int fractional_bits, QuantizationInfo quantization_info, ActivationLayerInfo act_info, DataLayout data_layout)
     {
+        ARM_COMPUTE_ERROR_ON(data_layout == DataLayout::UNKNOWN);
+
         _fractional_bits   = fractional_bits;
         _quantization_info = quantization_info;
         _data_type         = data_type;
 
-        const TensorShape   weights_shape(kernel_size, kernel_size, input_shape.z(), num_kernels);
+        TensorShape         weights_shape(kernel_size, kernel_size, input_shape.z(), num_kernels);
         const TensorShape   bias_shape(num_kernels);
         const PadStrideInfo info(stride_x, stride_y, pad_x, pad_y, DimensionRoundingType::FLOOR);
-        const TensorShape   output_shape   = get_output_shape(input_shape, weights_shape, info);
         const DataType      bias_data_type = is_data_type_quantized_asymmetric(data_type) ? DataType::S32 : data_type;
 
-        _target    = compute_target(input_shape, weights_shape, bias_shape, output_shape, info, data_type, bias_data_type, fractional_bits, quantization_info);
-        _reference = compute_reference(input_shape, weights_shape, bias_shape, output_shape, info, data_type, bias_data_type, fractional_bits, quantization_info);
+        TensorInfo input_info   = TensorInfo(input_shape, 1, data_type, _fractional_bits);
+        TensorInfo weights_info = TensorInfo(weights_shape, 1, data_type, _fractional_bits);
+
+        const TensorShape output_shape = compute_deep_convolution_shape(input_info, weights_info, info);
+
+        _target    = compute_target(input_shape, weights_shape, bias_shape, output_shape, info, data_type, bias_data_type, fractional_bits, quantization_info, act_info, data_layout);
+        _reference = compute_reference(input_shape, weights_shape, bias_shape, output_shape, info, data_type, bias_data_type, fractional_bits, quantization_info, act_info);
     }
 
     template <typename...>
-    void setup(TensorShape input_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape output_shape, PadStrideInfo info,
-               DataType data_type, int fractional_bits, QuantizationInfo quantization_info)
+    void setup(TensorShape input_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape output_shape, PadStrideInfo info, Size2D dilation,
+               DataType data_type, int fractional_bits, QuantizationInfo quantization_info, ActivationLayerInfo act_info, DataLayout data_layout)
     {
+        ARM_COMPUTE_ERROR_ON(data_layout == DataLayout::UNKNOWN);
+        ARM_COMPUTE_UNUSED(dilation);
+
         _fractional_bits   = fractional_bits;
         _quantization_info = quantization_info;
         _data_type         = data_type;
 
         const DataType bias_data_type = is_data_type_quantized_asymmetric(data_type) ? DataType::S32 : data_type;
 
-        _target    = compute_target(input_shape, weights_shape, bias_shape, output_shape, info, data_type, bias_data_type, fractional_bits, quantization_info);
-        _reference = compute_reference(input_shape, weights_shape, bias_shape, output_shape, info, data_type, bias_data_type, fractional_bits, quantization_info);
+        _target    = compute_target(input_shape, weights_shape, bias_shape, output_shape, info, data_type, bias_data_type, fractional_bits, quantization_info, act_info, data_layout);
+        _reference = compute_reference(input_shape, weights_shape, bias_shape, output_shape, info, data_type, bias_data_type, fractional_bits, quantization_info, act_info);
     }
 
 protected:
@@ -109,18 +123,25 @@
         }
     }
 
-    TensorType compute_target(const TensorShape &input_shape, const TensorShape &weights_shape, const TensorShape &bias_shape, const TensorShape &output_shape, const PadStrideInfo &info,
-                              DataType data_type, DataType bias_data_type, int fixed_point_position, QuantizationInfo quantization_info)
+    TensorType compute_target(TensorShape input_shape, TensorShape weights_shape, const TensorShape &bias_shape, TensorShape output_shape, const PadStrideInfo &info,
+                              DataType data_type, DataType bias_data_type, int fixed_point_position, QuantizationInfo quantization_info, ActivationLayerInfo act_info, const DataLayout &data_layout)
     {
+        if(data_layout == DataLayout::NHWC)
+        {
+            permute(input_shape, PermutationVector(2U, 0U, 1U));
+            permute(weights_shape, PermutationVector(2U, 0U, 1U));
+            permute(output_shape, PermutationVector(2U, 0U, 1U));
+        }
+
         // Create tensors
-        TensorType src     = create_tensor<TensorType>(input_shape, data_type, 1, fixed_point_position, quantization_info);
-        TensorType weights = create_tensor<TensorType>(weights_shape, data_type, 1, fixed_point_position, quantization_info);
+        TensorType src     = create_tensor<TensorType>(input_shape, data_type, 1, fixed_point_position, quantization_info, data_layout);
+        TensorType weights = create_tensor<TensorType>(weights_shape, data_type, 1, fixed_point_position, quantization_info, data_layout);
         TensorType bias    = create_tensor<TensorType>(bias_shape, bias_data_type, 1, fixed_point_position, quantization_info);
-        TensorType dst     = create_tensor<TensorType>(output_shape, data_type, 1, fixed_point_position, quantization_info);
+        TensorType dst     = create_tensor<TensorType>(output_shape, data_type, 1, fixed_point_position, quantization_info, data_layout);
 
         // Create and configure function
         FunctionType conv;
-        conv.configure(&src, &weights, &bias, &dst, info);
+        conv.configure(&src, &weights, &bias, &dst, info, act_info);
 
         ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
         ARM_COMPUTE_EXPECT(weights.info()->is_resizable(), framework::LogLevel::ERRORS);
@@ -150,7 +171,7 @@
     }
 
     SimpleTensor<T> compute_reference(const TensorShape &input_shape, const TensorShape &weights_shape, const TensorShape &bias_shape, const TensorShape &output_shape, const PadStrideInfo &info,
-                                      DataType data_type, DataType bias_data_type, int fixed_point_position, QuantizationInfo quantization_info)
+                                      DataType data_type, DataType bias_data_type, int fixed_point_position, QuantizationInfo quantization_info, ActivationLayerInfo act_info)
     {
         // Create reference
         SimpleTensor<T>     src{ input_shape, data_type, 1, fixed_point_position, quantization_info };
@@ -162,7 +183,9 @@
         fill(weights, 1);
         fill(bias, 2);
 
-        return reference::convolution_layer<T>(src, weights, bias, output_shape, info);
+        SimpleTensor<T> dst = reference::convolution_layer<T>(src, weights, bias, output_shape, info);
+
+        return (act_info.enabled()) ? reference::activation_layer<T>(dst, act_info) : dst;
     }
 
     TensorType       _target{};
@@ -170,21 +193,6 @@
     int              _fractional_bits{};
     QuantizationInfo _quantization_info{};
     DataType         _data_type{};
-
-private:
-    TensorShape get_output_shape(TensorShape in_shape, TensorShape kernel_shape, const PadStrideInfo &info)
-    {
-        TensorShape out_shape(in_shape);
-        const std::pair<unsigned int, unsigned int> scaled_dims = scaled_dimensions(in_shape.x(),
-                                                                                    in_shape.y(),
-                                                                                    kernel_shape.x(),
-                                                                                    kernel_shape.y(),
-                                                                                    info);
-        out_shape.set(0, scaled_dims.first);
-        out_shape.set(1, scaled_dims.second);
-        out_shape.set(2, kernel_shape[3]);
-        return out_shape;
-    }
 };
 
 template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
@@ -192,9 +200,11 @@
 {
 public:
     template <typename...>
-    void setup(TensorShape input_shape, int stride_x, int stride_y, int pad_x, int pad_y, unsigned int kernel_size, unsigned int num_kernels, DataType data_type)
+    void setup(TensorShape input_shape, int stride_x, int stride_y, int pad_x, int pad_y, unsigned int kernel_size, unsigned int num_kernels, DataType data_type, ActivationLayerInfo act_info,
+               DataLayout data_layout)
     {
-        DirectConvolutionValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(input_shape, stride_x, stride_y, pad_x, pad_y, kernel_size, num_kernels, data_type, 0, QuantizationInfo());
+        DirectConvolutionValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(input_shape, stride_x, stride_y, pad_x, pad_y, kernel_size, num_kernels, data_type, 0, QuantizationInfo(),
+                                                                                                    act_info, data_layout);
     }
 };
 
@@ -203,10 +213,11 @@
 {
 public:
     template <typename...>
-    void setup(TensorShape input_shape, int stride_x, int stride_y, int pad_x, int pad_y, unsigned int kernel_size, unsigned int num_kernels, DataType data_type, int fractional_bits)
+    void setup(TensorShape input_shape, int stride_x, int stride_y, int pad_x, int pad_y, unsigned int kernel_size, unsigned int num_kernels, DataType data_type, int fractional_bits,
+               ActivationLayerInfo act_info)
     {
         DirectConvolutionValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(input_shape, stride_x, stride_y, pad_x, pad_y, kernel_size, num_kernels, data_type, fractional_bits,
-                                                                                                    QuantizationInfo());
+                                                                                                    QuantizationInfo(), act_info, DataLayout::NCHW);
     }
 };
 
@@ -215,9 +226,11 @@
 {
 public:
     template <typename...>
-    void setup(TensorShape input_shape, int stride_x, int stride_y, int pad_x, int pad_y, unsigned int kernel_size, unsigned int num_kernels, DataType data_type, QuantizationInfo quantization_info)
+    void setup(TensorShape input_shape, int stride_x, int stride_y, int pad_x, int pad_y, unsigned int kernel_size, unsigned int num_kernels, DataType data_type, QuantizationInfo quantization_info,
+               ActivationLayerInfo act_info)
     {
-        DirectConvolutionValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(input_shape, stride_x, stride_y, pad_x, pad_y, kernel_size, num_kernels, data_type, 0, quantization_info);
+        DirectConvolutionValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(input_shape, stride_x, stride_y, pad_x, pad_y, kernel_size, num_kernels, data_type, 0, quantization_info,
+                                                                                                    act_info, DataLayout::NCHW);
     }
 };
 
@@ -226,10 +239,11 @@
 {
 public:
     template <typename...>
-    void setup(TensorShape input_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape output_shape, PadStrideInfo info,
-               DataType data_type, QuantizationInfo quantization_info)
+    void setup(TensorShape input_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape output_shape, PadStrideInfo info, Size2D dilation,
+               DataType data_type, QuantizationInfo quantization_info, ActivationLayerInfo act_info)
     {
-        DirectConvolutionValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(input_shape, weights_shape, bias_shape, output_shape, info, data_type, 0, quantization_info);
+        DirectConvolutionValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(input_shape, weights_shape, bias_shape, output_shape, info, dilation, data_type, 0, quantization_info,
+                                                                                                    act_info, DataLayout::NCHW);
     }
 };
 
@@ -238,10 +252,11 @@
 {
 public:
     template <typename...>
-    void setup(TensorShape input_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape output_shape, PadStrideInfo info,
-               DataType data_type)
+    void setup(TensorShape input_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape output_shape, PadStrideInfo info, Size2D dilation,
+               DataType data_type, ActivationLayerInfo act_info)
     {
-        DirectConvolutionValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(input_shape, weights_shape, bias_shape, output_shape, info, data_type, 0, QuantizationInfo());
+        DirectConvolutionValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(input_shape, weights_shape, bias_shape, output_shape, info, dilation, data_type, 0, QuantizationInfo(),
+                                                                                                    act_info, DataLayout::NCHW);
     }
 };
 
diff --git a/tests/validation/fixtures/DirectConvolutionLayerTensorShiftFixture.h b/tests/validation/fixtures/DirectConvolutionLayerTensorShiftFixture.h
index d810a76..09b6d83 100644
--- a/tests/validation/fixtures/DirectConvolutionLayerTensorShiftFixture.h
+++ b/tests/validation/fixtures/DirectConvolutionLayerTensorShiftFixture.h
@@ -67,9 +67,11 @@
     }
 
     template <typename...>
-    void setup(TensorShape input_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape output_shape, PadStrideInfo info,
+    void setup(TensorShape input_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape output_shape, PadStrideInfo info, unsigned int dilation_x, unsigned int dilation_y,
                DataType data_type, int fractional_bits, QuantizationInfo quantization_info)
     {
+        ARM_COMPUTE_UNUSED(dilation_x, dilation_y);
+
         _fractional_bits   = fractional_bits;
         _quantization_info = quantization_info;
         _data_type         = data_type;
@@ -245,10 +247,11 @@
 {
 public:
     template <typename...>
-    void setup(TensorShape input_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape output_shape, PadStrideInfo info,
+    void setup(TensorShape input_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape output_shape, PadStrideInfo info, unsigned int dilation_x, unsigned int dilation_y,
                DataType data_type, QuantizationInfo quantization_info)
     {
-        DirectConvolutionValidationGenericTensorShiftFixture<TensorType, AccessorType, FunctionType, T>::setup(input_shape, weights_shape, bias_shape, output_shape, info, data_type, 0, quantization_info);
+        DirectConvolutionValidationGenericTensorShiftFixture<TensorType, AccessorType, FunctionType, T>::setup(input_shape, weights_shape, bias_shape, output_shape, info, dilation_x, dilation_y, data_type, 0,
+                                                                                                               quantization_info);
     }
 };
 
@@ -257,10 +260,11 @@
 {
 public:
     template <typename...>
-    void setup(TensorShape input_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape output_shape, PadStrideInfo info,
+    void setup(TensorShape input_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape output_shape, PadStrideInfo info, unsigned int dilation_x, unsigned int dilation_y,
                DataType data_type)
     {
-        DirectConvolutionValidationGenericTensorShiftFixture<TensorType, AccessorType, FunctionType, T>::setup(input_shape, weights_shape, bias_shape, output_shape, info, data_type, 0, QuantizationInfo());
+        DirectConvolutionValidationGenericTensorShiftFixture<TensorType, AccessorType, FunctionType, T>::setup(input_shape, weights_shape, bias_shape, output_shape, info, dilation_x, dilation_y, data_type, 0,
+                                                                                                               QuantizationInfo());
     }
 };
 
diff --git a/tests/validation/fixtures/FlattenLayerFixture.h b/tests/validation/fixtures/FlattenLayerFixture.h
index 3de0ba4..ef94ea8 100644
--- a/tests/validation/fixtures/FlattenLayerFixture.h
+++ b/tests/validation/fixtures/FlattenLayerFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,6 +27,7 @@
 #include "arm_compute/core/TensorShape.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/Tensor.h"
 #include "tests/AssetsLibrary.h"
 #include "tests/Globals.h"
@@ -43,6 +44,8 @@
 {
 namespace validation
 {
+using namespace arm_compute::misc::shape_calculator;
+
 template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
 class FlattenLayerValidationFixture : public framework::Fixture
 {
@@ -51,8 +54,13 @@
     void setup(TensorShape shape, DataType data_type)
     {
         _fractional_bits = is_data_type_fixed_point(data_type) ? 4 : 0;
-        _target          = compute_target(shape, data_type);
-        _reference       = compute_reference(shape, data_type);
+
+        TensorShape shape_flatten;
+        TensorInfo  input_info(shape, 1, data_type, _fractional_bits);
+        shape_flatten = compute_im2col_flatten_shape(&input_info);
+
+        _target    = compute_target(shape, shape_flatten, data_type);
+        _reference = compute_reference(shape, shape_flatten, data_type);
         ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(_target.info()->tensor_shape(), _reference.shape());
     }
 
@@ -73,11 +81,8 @@
         }
     }
 
-    TensorType compute_target(const TensorShape &shape, DataType data_type)
+    TensorType compute_target(const TensorShape &shape, const TensorShape &shape_flatten, DataType data_type)
     {
-        TensorShape shape_flatten(shape);
-        shape_flatten.collapse(3);
-
         // Create tensors
         TensorType src = create_tensor<TensorType>(shape, data_type, 1, _fractional_bits);
         TensorType dst = create_tensor<TensorType>(shape_flatten, data_type, 1, _fractional_bits);
@@ -105,7 +110,7 @@
         return dst;
     }
 
-    SimpleTensor<T> compute_reference(const TensorShape &shape, DataType data_type)
+    SimpleTensor<T> compute_reference(const TensorShape &shape, const TensorShape &shape_flatten, DataType data_type)
     {
         // Create reference
         SimpleTensor<T> src{ shape, data_type, 1, _fractional_bits };
@@ -113,7 +118,7 @@
         // Fill reference
         fill(src);
 
-        return reference::flatten_layer<T>(src);
+        return reference::flatten_layer<T>(src, shape_flatten);
     }
 
     TensorType      _target{};
diff --git a/tests/validation/fixtures/GEMMInterleaveBlockedFixture.h b/tests/validation/fixtures/GEMMInterleaveBlockedFixture.h
deleted file mode 100644
index 488324d..0000000
--- a/tests/validation/fixtures/GEMMInterleaveBlockedFixture.h
+++ /dev/null
@@ -1,114 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_TEST_GEMM_INTERLEAVE_BLOCKED_FIXTURE
-#define ARM_COMPUTE_TEST_GEMM_INTERLEAVE_BLOCKED_FIXTURE
-
-#include "arm_compute/core/TensorShape.h"
-#include "arm_compute/core/Types.h"
-#include "tests/AssetsLibrary.h"
-#include "tests/Globals.h"
-#include "tests/IAccessor.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Fixture.h"
-#include "tests/validation/Helpers.h"
-#include "tests/validation/reference/GEMMInterleaveBlocked.h"
-
-#include <random>
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-template <typename TensorType, typename AccessorType, typename FunctionType, bool Transposed = false>
-class GEMMInterleaveBlockedValidationFixture : public framework::Fixture
-{
-public:
-    template <typename...>
-    void setup(size_t x, size_t y, int int_by, int block)
-    {
-        const float       interleave_by_f32 = int_by;
-        const TensorShape shape_a(x, y);
-        const TensorShape shape_b(static_cast<size_t>(x * interleave_by_f32), static_cast<size_t>(std::ceil(y / interleave_by_f32)));
-        _target    = compute_target(shape_a, shape_b, int_by, block);
-        _reference = compute_reference(shape_a, shape_b, int_by, block);
-    }
-
-protected:
-    template <typename U>
-    void fill(U &&tensor, int i)
-    {
-        ARM_COMPUTE_ERROR_ON(tensor.data_type() != DataType::U8);
-        std::uniform_int_distribution<> distribution(0, 255);
-        library->fill(tensor, distribution, i);
-    }
-
-    TensorType compute_target(const TensorShape &shape_a, const TensorShape &shape_b, int int_by, int block)
-    {
-        // Create tensors
-        TensorType a = create_tensor<TensorType>(shape_a, DataType::U8, 1);
-        TensorType b = create_tensor<TensorType>(shape_b, DataType::U8, 1);
-
-        // Create and configure function
-        FunctionType f;
-        f.configure(&a, &b, int_by, block, Transposed);
-
-        ARM_COMPUTE_EXPECT(a.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(b.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-        // Allocate tensors
-        a.allocator()->allocate();
-        b.allocator()->allocate();
-
-        ARM_COMPUTE_EXPECT(!a.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!b.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-        // Fill tensors
-        fill(AccessorType(a), 0);
-
-        // Compute GEMM function
-        f.run();
-        return b;
-    }
-
-    SimpleTensor<uint8_t> compute_reference(const TensorShape &shape_a, const TensorShape &shape_b, int int_by, int block)
-    {
-        // Create reference
-        SimpleTensor<uint8_t> a{ shape_a, DataType::U8, 1 };
-        SimpleTensor<uint8_t> b{ shape_b, DataType::U8, 1 };
-
-        // Fill reference
-        fill(a, 0);
-        return reference::gemm_interleave_blocked<uint8_t>(a, b, int_by, block, Transposed);
-    }
-
-    TensorType            _target{};
-    SimpleTensor<uint8_t> _reference{};
-};
-
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_GEMM_INTERLEAVE_BLOCKED_FIXTURE */
diff --git a/tests/validation/fixtures/HOGDescriptorFixture.h b/tests/validation/fixtures/HOGDescriptorFixture.h
index cabee63..6097059 100644
--- a/tests/validation/fixtures/HOGDescriptorFixture.h
+++ b/tests/validation/fixtures/HOGDescriptorFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017, 2018 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -81,14 +81,7 @@
         TensorInfo tensor_info_hog_descriptor(hog_info, shape.x(), shape.y());
 
         // Create HOG
-        HOGType hog = create_HOG<HOGType>(hog_info.cell_size(),
-                                          hog_info.block_size(),
-                                          hog_info.detection_window_size(),
-                                          hog_info.block_stride(),
-                                          hog_info.num_bins(),
-                                          hog_info.normalization_type(),
-                                          hog_info.l2_hyst_threshold(),
-                                          hog_info.phase_type());
+        HOGType hog = create_HOG<HOGType>(hog_info);
 
         // Create tensors
         TensorType src = create_tensor<TensorType>(shape, data_type_from_format(format));
diff --git a/tests/validation/fixtures/HOGDetectorFixture.h b/tests/validation/fixtures/HOGDetectorFixture.h
new file mode 100644
index 0000000..c2d0514
--- /dev/null
+++ b/tests/validation/fixtures/HOGDetectorFixture.h
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_TEST_HOG_DETECTOR_FIXTURE
+#define ARM_COMPUTE_TEST_HOG_DETECTOR_FIXTURE
+
+#include "arm_compute/core/HOGInfo.h"
+#include "arm_compute/core/Types.h"
+#include "tests/AssetsLibrary.h"
+#include "tests/Globals.h"
+#include "tests/IAccessor.h"
+#include "tests/IHOGAccessor.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Fixture.h"
+#include "tests/validation/fixtures/HOGDescriptorFixture.h"
+#include "tests/validation/reference/HOGDetector.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+template <typename TensorType,
+          typename HOGType,
+          typename DetectionWindowArrayType,
+          typename HOGDescriptorType,
+          typename AccessorType,
+          typename ArrayAccessorType,
+          typename HOGAccessorType,
+          typename HOGDetectorType,
+          typename T,
+          typename U>
+class HOGDetectorValidationFixture : public HOGDescriptorValidationFixture<TensorType, HOGType, AccessorType, HOGDescriptorType, T, U>
+{
+public:
+    template <typename...>
+    void setup(Size2D detection_window_stride, std::string image, HOGInfo hog_info, Format format, BorderMode border_mode)
+    {
+        using HDF = HOGDescriptorValidationFixture<TensorType, HOGType, AccessorType, HOGDescriptorType, T, U>;
+        HDF::setup(image, hog_info, format, border_mode);
+
+        const unsigned int max_num_detection_windows = 100000;
+
+        // Initialise descriptor (linear SVM coefficients).
+        // NOTE: Fixed values are used to keep the number of detection windows detected
+        // consistent in order to have meaningful validation tolerances.
+        // The values are "unbalanced" to reduce the number of detected objects
+        std::random_device::result_type seed       = 0;
+        std::vector<U>                  descriptor = generate_random_real(hog_info.descriptor_size(), -0.505f, 0.495f, seed);
+
+        // Compute target and reference values using feature vector from descriptor kernel
+        _target    = compute_target(HDF::_target, descriptor, max_num_detection_windows, hog_info, detection_window_stride);
+        _reference = compute_reference(HDF::_reference, descriptor, max_num_detection_windows, hog_info, detection_window_stride);
+    }
+
+protected:
+    std::vector<DetectionWindow> compute_target(const TensorType &src, const std::vector<U> &descriptor, unsigned int max_num_detection_windows,
+                                                const HOGInfo &hog_info, const Size2D &detection_window_stride)
+    {
+        // Create HOG
+        HOGType hog = create_HOG<HOGType>(hog_info);
+
+        // Create array of detection windows
+        DetectionWindowArrayType detection_windows(max_num_detection_windows);
+
+        // Copy HOG descriptor values to HOG memory
+        {
+            HOGAccessorType hog_accessor(hog);
+            std::memcpy(hog_accessor.descriptor(), descriptor.data(), descriptor.size() * sizeof(U));
+        }
+
+        // Create and configure function
+        HOGDetectorType hog_detector;
+        hog_detector.configure(&src, &hog, &detection_windows, detection_window_stride);
+
+        // Reset detection windows
+        detection_windows.clear();
+
+        // Compute function
+        hog_detector.run();
+
+        // Create array of detection windows
+        std::vector<DetectionWindow> windows;
+
+        // Copy detection windows
+        ArrayAccessorType accessor(detection_windows);
+
+        for(size_t i = 0; i < accessor.num_values(); i++)
+        {
+            DetectionWindow win;
+            win.x         = accessor.at(i).x;
+            win.y         = accessor.at(i).y;
+            win.width     = accessor.at(i).width;
+            win.height    = accessor.at(i).height;
+            win.idx_class = accessor.at(i).idx_class;
+            win.score     = accessor.at(i).score;
+
+            windows.push_back(win);
+        }
+
+        return windows;
+    }
+
+    std::vector<DetectionWindow> compute_reference(const SimpleTensor<U> &src, const std::vector<U> &descriptor, unsigned int max_num_detection_windows,
+                                                   const HOGInfo &hog_info, const Size2D &detection_window_stride)
+    {
+        // Assumes default values of zero for threshold and class_idx.
+        return reference::hog_detector(src, descriptor, max_num_detection_windows, hog_info, detection_window_stride);
+    }
+
+    std::vector<DetectionWindow> _target{};
+    std::vector<DetectionWindow> _reference{};
+};
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_TEST_HOG_DETECTOR_FIXTURE */
diff --git a/tests/validation/fixtures/HOGMultiDetectionFixture.h b/tests/validation/fixtures/HOGMultiDetectionFixture.h
new file mode 100644
index 0000000..039f3f4
--- /dev/null
+++ b/tests/validation/fixtures/HOGMultiDetectionFixture.h
@@ -0,0 +1,193 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_TEST_HOG_MULTI_DETECTION_FIXTURE
+#define ARM_COMPUTE_TEST_HOG_MULTI_DETECTION_FIXTURE
+
+#include "arm_compute/core/HOGInfo.h"
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Types.h"
+#include "tests/AssetsLibrary.h"
+#include "tests/Globals.h"
+#include "tests/IAccessor.h"
+#include "tests/IHOGAccessor.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Fixture.h"
+#include "tests/validation/reference/HOGMultiDetection.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+template <typename TensorType,
+          typename HOGType,
+          typename MultiHOGType,
+          typename DetectionWindowArrayType,
+          typename DetectionWindowStrideType,
+          typename AccessorType,
+          typename Size2DArrayAccessorType,
+          typename DetectionWindowArrayAccessorType,
+          typename HOGAccessorType,
+          typename FunctionType,
+          typename T,
+          typename U>
+class HOGMultiDetectionValidationFixture : public framework::Fixture
+{
+public:
+    template <typename...>
+    void setup(std::string image, std::vector<HOGInfo> models, Format format, BorderMode border_mode, bool non_maxima_suppression)
+    {
+        // Only defined borders supported
+        ARM_COMPUTE_ERROR_ON(border_mode == BorderMode::UNDEFINED);
+
+        // Generate a random constant value
+        std::mt19937                     gen(library->seed());
+        std::uniform_int_distribution<T> int_dist(0, 255);
+        const T                          constant_border_value = int_dist(gen);
+
+        // Initialize descriptors vector
+        std::vector<std::vector<U>> descriptors(models.size());
+
+        // Use default values for threshold and min_distance
+        const float threshold    = 0.f;
+        const float min_distance = 1.f;
+
+        // Maximum number of detection windows per batch
+        const unsigned int max_num_detection_windows = 100000;
+
+        _target    = compute_target(image, format, border_mode, constant_border_value, models, descriptors, max_num_detection_windows, threshold, non_maxima_suppression, min_distance);
+        _reference = compute_reference(image, format, border_mode, constant_border_value, models, descriptors, max_num_detection_windows, threshold, non_maxima_suppression, min_distance);
+    }
+
+protected:
+    template <typename V>
+    void fill(V &&tensor, const std::string image, Format format)
+    {
+        library->fill(tensor, image, format);
+    }
+
+    void initialize_batch(const std::vector<HOGInfo> &models, MultiHOGType &multi_hog,
+                          std::vector<std::vector<U>> &descriptors, DetectionWindowStrideType &detection_window_strides)
+    {
+        for(unsigned i = 0; i < models.size(); ++i)
+        {
+            auto hog_model = reinterpret_cast<HOGType *>(multi_hog.model(i));
+            hog_model->init(models[i]);
+
+            // Initialise descriptor (linear SVM coefficients).
+            std::random_device::result_type seed = 0;
+            descriptors.at(i)                    = generate_random_real(models[i].descriptor_size(), -0.505f, 0.495f, seed);
+
+            // Copy HOG descriptor values to HOG memory
+            {
+                HOGAccessorType hog_accessor(*hog_model);
+                std::memcpy(hog_accessor.descriptor(), descriptors.at(i).data(), descriptors.at(i).size() * sizeof(U));
+            }
+
+            // Initialize detection window stride
+            Size2DArrayAccessorType accessor(detection_window_strides);
+            accessor.at(i) = models[i].block_stride();
+        }
+    }
+
+    std::vector<DetectionWindow> compute_target(const std::string image, Format &format, BorderMode &border_mode, T constant_border_value,
+                                                const std::vector<HOGInfo> &models, std::vector<std::vector<U>> &descriptors, unsigned int max_num_detection_windows,
+                                                float threshold, bool non_max_suppression, float min_distance)
+    {
+        MultiHOGType              multi_hog(models.size());
+        DetectionWindowArrayType  detection_windows(max_num_detection_windows);
+        DetectionWindowStrideType detection_window_strides(models.size());
+
+        // Resize detection window_strides for index access
+        detection_window_strides.resize(models.size());
+
+        // Initialize MultiHOG and detection windows
+        initialize_batch(models, multi_hog, descriptors, detection_window_strides);
+
+        // Get image shape for src tensor
+        TensorShape shape = library->get_image_shape(image);
+
+        // Create tensors
+        TensorType src = create_tensor<TensorType>(shape, data_type_from_format(format));
+        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+        // Create and configure function
+        FunctionType hog_multi_detection;
+        hog_multi_detection.configure(&src, &multi_hog, &detection_windows, &detection_window_strides, border_mode, constant_border_value, threshold, non_max_suppression, min_distance);
+
+        // Reset detection windows
+        detection_windows.clear();
+
+        // Allocate tensors
+        src.allocator()->allocate();
+        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+        // Fill tensors
+        fill(AccessorType(src), image, format);
+
+        // Compute function
+        hog_multi_detection.run();
+
+        // Copy detection windows
+        std::vector<DetectionWindow>     windows;
+        DetectionWindowArrayAccessorType accessor(detection_windows);
+
+        for(size_t i = 0; i < accessor.num_values(); i++)
+        {
+            DetectionWindow win;
+            win.x         = accessor.at(i).x;
+            win.y         = accessor.at(i).y;
+            win.width     = accessor.at(i).width;
+            win.height    = accessor.at(i).height;
+            win.idx_class = accessor.at(i).idx_class;
+            win.score     = accessor.at(i).score;
+
+            windows.push_back(win);
+        }
+
+        return windows;
+    }
+
+    std::vector<DetectionWindow> compute_reference(const std::string image, Format format, BorderMode border_mode, T constant_border_value,
+                                                   const std::vector<HOGInfo> &models, const std::vector<std::vector<U>> &descriptors, unsigned int max_num_detection_windows,
+                                                   float threshold, bool non_max_suppression, float min_distance)
+    {
+        // Create reference
+        SimpleTensor<T> src{ library->get_image_shape(image), data_type_from_format(format) };
+
+        // Fill reference
+        fill(src, image, format);
+
+        // NOTE: Detection window stride fixed to block stride
+        return reference::hog_multi_detection(src, border_mode, constant_border_value, models, descriptors, max_num_detection_windows, threshold, non_max_suppression, min_distance);
+    }
+
+    std::vector<DetectionWindow> _target{};
+    std::vector<DetectionWindow> _reference{};
+};
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_TEST_HOG_MULTI_DETECTION_FIXTURE */
diff --git a/tests/validation/fixtures/HarrisCornersFixture.h b/tests/validation/fixtures/HarrisCornersFixture.h
index ae262af..e3c29ae 100644
--- a/tests/validation/fixtures/HarrisCornersFixture.h
+++ b/tests/validation/fixtures/HarrisCornersFixture.h
@@ -51,7 +51,7 @@
     {
         HarrisCornersParameters params = harris_corners_parameters();
 
-        _target    = compute_target(image, gradient_size, block_size, border_mode, use_fp16, format, params);
+        _target = compute_target(image, gradient_size, block_size, border_mode, use_fp16, format, params);
         _reference = compute_reference(image, gradient_size, block_size, border_mode, format, params);
     }
 
diff --git a/tests/validation/fixtures/Im2ColFixture.h b/tests/validation/fixtures/Im2ColFixture.h
new file mode 100644
index 0000000..7ef3cdc
--- /dev/null
+++ b/tests/validation/fixtures/Im2ColFixture.h
@@ -0,0 +1,131 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_TEST_IM2COL_FIXTURE
+#define ARM_COMPUTE_TEST_IM2COL_FIXTURE
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "tests/AssetsLibrary.h"
+#include "tests/Globals.h"
+#include "tests/IAccessor.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Fixture.h"
+#include "tests/validation/reference/Im2Col.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+using namespace arm_compute::misc::shape_calculator;
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class Im2ColValidationFixture : public framework::Fixture
+{
+public:
+    template <typename...>
+    void setup(TensorShape input_shape, DataType data_type, const Size2D &kernel_dims, const PadStrideInfo &conv_info, const QuantizationInfo &quant_info, const DataLayout &data_layout)
+    {
+        _kernel_dims = kernel_dims;
+        _conv_info   = conv_info;
+        _quant_info  = quant_info;
+        _data_layout = data_layout;
+        _has_bias    = data_type != DataType::QASYMM8;
+
+        if(_data_layout == DataLayout::NHWC)
+        {
+            permute(input_shape, PermutationVector(2U, 0U, 1U));
+        }
+
+        TensorInfo input_info(input_shape, 1, data_type);
+        input_info.set_data_layout(_data_layout);
+
+        const TensorShape output_shape = compute_im2col_conv_shape(&input_info, _kernel_dims, _conv_info, _has_bias, Size2D(1U, 1U));
+
+        _target = compute_target(input_shape, output_shape, data_type);
+
+        compute_reference(input_shape, output_shape, data_type);
+    }
+
+protected:
+    template <typename U>
+    void fill(U &&tensor)
+    {
+        library->fill_tensor_uniform(tensor, 0);
+    }
+
+    TensorType compute_target(const TensorShape &input_shape, const TensorShape &output_shape, DataType data_type)
+    {
+        // Create tensors
+        TensorType src = create_tensor<TensorType>(input_shape, data_type, 1, 0, _quant_info, _data_layout);
+        TensorType dst = create_tensor<TensorType>(output_shape, data_type, 1, 0, _quant_info);
+
+        // Create and configure function
+        FunctionType im2col_func;
+        im2col_func.configure(&src, &dst, _kernel_dims, _conv_info, _has_bias);
+
+        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+        // Allocate tensors
+        src.allocator()->allocate();
+        dst.allocator()->allocate();
+
+        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+        // Fill tensors
+        fill(AccessorType(src));
+
+        // Compute function
+        im2col_func.run();
+
+        return dst;
+    }
+
+    void compute_reference(const TensorShape &input_shape, const TensorShape &output_shape, DataType data_type)
+    {
+        // Create reference
+        SimpleTensor<T> src{ input_shape, data_type, 1, 0, _quant_info, _data_layout };
+        _reference = SimpleTensor<T>(output_shape, data_type, 1, 0, _quant_info, DataLayout::NCHW);
+        // Fill reference
+        fill(src);
+        reference::im2col<T>(src, _reference, _kernel_dims, _conv_info, _has_bias);
+    }
+    TensorType       _target{};
+    SimpleTensor<T>  _reference{};
+    Size2D           _kernel_dims{};
+    PadStrideInfo    _conv_info{};
+    DataLayout       _data_layout{};
+    QuantizationInfo _quant_info{};
+    bool             _has_bias{};
+};
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_TEST_IM2COL_FIXTURE */
diff --git a/tests/validation/fixtures/LSTMLayerFixture.h b/tests/validation/fixtures/LSTMLayerFixture.h
new file mode 100644
index 0000000..b7e43b3
--- /dev/null
+++ b/tests/validation/fixtures/LSTMLayerFixture.h
@@ -0,0 +1,404 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_TEST_LSTM_LAYER_FIXTURE
+#define ARM_COMPUTE_TEST_LSTM_LAYER_FIXTURE
+
+#include "tests/Globals.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Fixture.h"
+#include "tests/validation/reference/ActivationLayer.h"
+#include "tests/validation/reference/ArithmeticAddition.h"
+#include "tests/validation/reference/ArithmeticSubtraction.h"
+#include "tests/validation/reference/FullyConnectedLayer.h"
+#include "tests/validation/reference/GEMM.h"
+#include "tests/validation/reference/PixelWiseMultiplication.h"
+#include "tests/validation/reference/Transpose.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+template <typename TensorType, typename AccessorType, typename FunctionType, typename FunctionParams, typename T>
+class LSTMLayerValidationFixture : public framework::Fixture
+{
+public:
+    template <typename...>
+    void setup(TensorShape input_shape, TensorShape input_weights_shape, TensorShape recurrent_weights_shape, TensorShape cell_bias_shape, TensorShape output_cell_shape, TensorShape output_shape,
+               TensorShape scratch_shape, ActivationLayerInfo info, float cell_threshold, float projection_threshold, DataType data_type, bool projection_opt, bool peephole_opt)
+    {
+        _target = compute_target(input_shape, input_weights_shape, recurrent_weights_shape, cell_bias_shape, output_cell_shape, output_shape, scratch_shape, info, cell_threshold, projection_threshold,
+                                 data_type, projection_opt, peephole_opt);
+        _reference = compute_reference(input_shape, input_weights_shape, recurrent_weights_shape, cell_bias_shape, output_cell_shape, output_shape, scratch_shape, info, cell_threshold, projection_threshold,
+                                       data_type, projection_opt, peephole_opt);
+    }
+
+protected:
+    template <typename U>
+    void fill(U &&tensor, int i)
+    {
+        std::uniform_real_distribution<> distribution(-1.0f, 1.0f);
+        library->fill(tensor, distribution, i);
+    }
+    template <typename U>
+    void fill_custom_val(U &&tensor, float num, int i)
+    {
+        std::uniform_real_distribution<> distribution(num, num);
+        library->fill(tensor, distribution, i);
+    }
+    TensorType compute_target(const TensorShape &input_shape, const TensorShape &input_weights_shape, const TensorShape &recurrent_weights_shape, const TensorShape &cell_bias_shape,
+                              const TensorShape &output_cell_shape, const TensorShape &output_shape, const TensorShape &scratch_shape, ActivationLayerInfo info, float cell_threshold,
+                              float projection_threshold, DataType data_type, bool projection_opt, bool peephole_opt)
+    {
+        // Create projection bias shape
+        TensorShape projection_bias_shape{};
+        projection_bias_shape.set(0, output_shape.x());
+
+        // Create tensors
+        TensorType input                 = create_tensor<TensorType>(input_shape, data_type);
+        TensorType input_to_forget_w     = create_tensor<TensorType>(input_weights_shape, data_type);
+        TensorType input_to_cell_w       = create_tensor<TensorType>(input_weights_shape, data_type);
+        TensorType input_to_output_w     = create_tensor<TensorType>(input_weights_shape, data_type);
+        TensorType recurrent_to_forget_w = create_tensor<TensorType>(recurrent_weights_shape, data_type);
+        TensorType recurrent_to_cell_w   = create_tensor<TensorType>(recurrent_weights_shape, data_type);
+        TensorType recurrent_to_output_w = create_tensor<TensorType>(recurrent_weights_shape, data_type);
+        TensorType forget_gate_bias      = create_tensor<TensorType>(cell_bias_shape, data_type);
+        TensorType cell_bias             = create_tensor<TensorType>(cell_bias_shape, data_type);
+        TensorType output_gate_bias      = create_tensor<TensorType>(cell_bias_shape, data_type);
+        TensorType output_state          = create_tensor<TensorType>(output_shape, data_type);
+        TensorType cell_state            = create_tensor<TensorType>(output_cell_shape, data_type);
+        TensorType scratch               = create_tensor<TensorType>(scratch_shape, data_type);
+        TensorType output                = create_tensor<TensorType>(output_shape, data_type);
+        TensorType input_to_input_w;
+        TensorType recurrent_to_input_w;
+        TensorType cell_to_input_w;
+        TensorType cell_to_forget_w;
+        TensorType input_gate_bias;
+        TensorType cell_to_output_w;
+        TensorType projection_w;
+        TensorType projection_bias;
+
+        bool cifg_opt = scratch_shape.x() == cell_bias_shape.x() * 4 ? true : false;
+
+        FunctionParams lstm_params;
+
+        if(!cifg_opt)
+        {
+            input_to_input_w     = create_tensor<TensorType>(input_weights_shape, data_type);
+            recurrent_to_input_w = create_tensor<TensorType>(recurrent_weights_shape, data_type);
+            cell_to_input_w      = create_tensor<TensorType>(cell_bias_shape, data_type);
+            input_gate_bias      = create_tensor<TensorType>(cell_bias_shape, data_type);
+            lstm_params.set_cifg_params(&input_to_input_w, &recurrent_to_input_w, &cell_to_input_w, &input_gate_bias);
+        }
+
+        if(peephole_opt)
+        {
+            if(cifg_opt)
+            {
+                cell_to_input_w = create_tensor<TensorType>(cell_bias_shape, data_type);
+            }
+            cell_to_forget_w = create_tensor<TensorType>(cell_bias_shape, data_type);
+            cell_to_output_w = create_tensor<TensorType>(cell_bias_shape, data_type);
+            lstm_params.set_peephole_params(&cell_to_input_w, &cell_to_forget_w, &cell_to_output_w);
+        }
+
+        if(projection_opt)
+        {
+            projection_w    = create_tensor<TensorType>(recurrent_weights_shape, data_type);
+            projection_bias = create_tensor<TensorType>(projection_bias_shape, data_type);
+            lstm_params.set_projection_params(&projection_w, &projection_bias);
+        }
+
+        // Create and configure function
+        FunctionType lstm;
+        lstm.configure(&input, &input_to_forget_w, &input_to_cell_w, &input_to_output_w, &recurrent_to_forget_w,
+                       &recurrent_to_cell_w, &recurrent_to_output_w, &forget_gate_bias, &cell_bias, &output_gate_bias, &output_state, &cell_state,
+                       &scratch, &output, lstm_params, info, cell_threshold, projection_threshold);
+
+        ARM_COMPUTE_EXPECT(input.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(input_to_forget_w.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(input_to_cell_w.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(input_to_output_w.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(recurrent_to_forget_w.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(recurrent_to_cell_w.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(recurrent_to_output_w.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(forget_gate_bias.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(cell_bias.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(output_gate_bias.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(output_state.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(cell_state.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(scratch.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(output.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+        // Allocate tensors
+        input.allocator()->allocate();
+        input_to_forget_w.allocator()->allocate();
+        input_to_cell_w.allocator()->allocate();
+        input_to_output_w.allocator()->allocate();
+        recurrent_to_forget_w.allocator()->allocate();
+        recurrent_to_cell_w.allocator()->allocate();
+        recurrent_to_output_w.allocator()->allocate();
+        forget_gate_bias.allocator()->allocate();
+        cell_bias.allocator()->allocate();
+        output_gate_bias.allocator()->allocate();
+        output_state.allocator()->allocate();
+        cell_state.allocator()->allocate();
+        scratch.allocator()->allocate();
+        output.allocator()->allocate();
+
+        ARM_COMPUTE_EXPECT(!input.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(!input_to_forget_w.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(!input_to_cell_w.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(!input_to_output_w.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(!recurrent_to_forget_w.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(!recurrent_to_cell_w.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(!recurrent_to_output_w.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(!forget_gate_bias.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(!cell_bias.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(!output_gate_bias.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(!output_state.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(!cell_state.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(!scratch.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(!output.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+        // Fill tensors
+        fill(AccessorType(input), 0);
+        fill(AccessorType(input_to_forget_w), 1);
+        fill(AccessorType(input_to_cell_w), 2);
+        fill(AccessorType(input_to_output_w), 3);
+        fill(AccessorType(recurrent_to_forget_w), 4);
+        fill(AccessorType(recurrent_to_cell_w), 5);
+        fill(AccessorType(recurrent_to_output_w), 6);
+        fill(AccessorType(forget_gate_bias), 7);
+        fill(AccessorType(cell_bias), 8);
+        fill(AccessorType(output_gate_bias), 9);
+        fill(AccessorType(output_state), 10);
+        fill(AccessorType(cell_state), 11);
+        fill(AccessorType(scratch), 12);
+
+        if(!cifg_opt)
+        {
+            ARM_COMPUTE_EXPECT(input_to_input_w.info()->is_resizable(), framework::LogLevel::ERRORS);
+            ARM_COMPUTE_EXPECT(recurrent_to_input_w.info()->is_resizable(), framework::LogLevel::ERRORS);
+            ARM_COMPUTE_EXPECT(cell_to_input_w.info()->is_resizable(), framework::LogLevel::ERRORS);
+            ARM_COMPUTE_EXPECT(input_gate_bias.info()->is_resizable(), framework::LogLevel::ERRORS);
+            input_to_input_w.allocator()->allocate();
+            recurrent_to_input_w.allocator()->allocate();
+            cell_to_input_w.allocator()->allocate();
+            input_gate_bias.allocator()->allocate();
+            ARM_COMPUTE_EXPECT(!input_to_input_w.info()->is_resizable(), framework::LogLevel::ERRORS);
+            ARM_COMPUTE_EXPECT(!recurrent_to_input_w.info()->is_resizable(), framework::LogLevel::ERRORS);
+            ARM_COMPUTE_EXPECT(!cell_to_input_w.info()->is_resizable(), framework::LogLevel::ERRORS);
+            ARM_COMPUTE_EXPECT(!input_gate_bias.info()->is_resizable(), framework::LogLevel::ERRORS);
+            fill(AccessorType(input_to_input_w), 13);
+            fill(AccessorType(recurrent_to_input_w), 14);
+            fill(AccessorType(cell_to_input_w), 15);
+            fill(AccessorType(recurrent_to_input_w), 16);
+            fill(AccessorType(input_gate_bias), 17);
+        }
+
+        if(peephole_opt)
+        {
+            if(cifg_opt)
+            {
+                ARM_COMPUTE_EXPECT(cell_to_input_w.info()->is_resizable(), framework::LogLevel::ERRORS);
+                cell_to_input_w.allocator()->allocate();
+                ARM_COMPUTE_EXPECT(!cell_to_input_w.info()->is_resizable(), framework::LogLevel::ERRORS);
+                fill(AccessorType(cell_to_input_w), 15);
+            }
+            ARM_COMPUTE_EXPECT(cell_to_forget_w.info()->is_resizable(), framework::LogLevel::ERRORS);
+            ARM_COMPUTE_EXPECT(cell_to_output_w.info()->is_resizable(), framework::LogLevel::ERRORS);
+            cell_to_forget_w.allocator()->allocate();
+            cell_to_output_w.allocator()->allocate();
+            ARM_COMPUTE_EXPECT(!cell_to_forget_w.info()->is_resizable(), framework::LogLevel::ERRORS);
+            ARM_COMPUTE_EXPECT(!cell_to_output_w.info()->is_resizable(), framework::LogLevel::ERRORS);
+            fill(AccessorType(cell_to_output_w), 18);
+        }
+
+        if(projection_opt)
+        {
+            ARM_COMPUTE_EXPECT(projection_w.info()->is_resizable(), framework::LogLevel::ERRORS);
+            ARM_COMPUTE_EXPECT(projection_bias.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+            projection_w.allocator()->allocate();
+            projection_bias.allocator()->allocate();
+
+            ARM_COMPUTE_EXPECT(!projection_w.info()->is_resizable(), framework::LogLevel::ERRORS);
+            ARM_COMPUTE_EXPECT(!projection_bias.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+            fill(AccessorType(projection_w), 19);
+            fill(AccessorType(projection_bias), 20);
+        }
+
+        // Compute function
+        lstm.run();
+
+        return output;
+    }
+
+    SimpleTensor<T> compute_reference(const TensorShape &input_shape, const TensorShape &input_weights_shape, const TensorShape &recurrent_weights_shape, const TensorShape &cell_bias_shape,
+                                      const TensorShape &output_cell_shape, const TensorShape &output_shape, const TensorShape &scratch_shape, ActivationLayerInfo info, float cell_threshold,
+                                      float projection_threshold, DataType data_type, bool projection_opt, bool peephole_opt)
+    {
+        // Create projection bias shape
+        TensorShape projection_bias_shape{};
+        projection_bias_shape.set(0, output_shape.x());
+
+        TensorShape     gemm_shape{ 1, output_shape.y() };
+        SimpleTensor<T> gemm_out{ gemm_shape, data_type };
+
+        // Create reference
+        SimpleTensor<T> input{ input_shape, data_type };
+        SimpleTensor<T> input_to_input_w{ input_weights_shape, data_type };
+        SimpleTensor<T> input_to_forget_w{ input_weights_shape, data_type };
+        SimpleTensor<T> input_to_cell_w{ input_weights_shape, data_type };
+        SimpleTensor<T> input_to_output_w{ input_weights_shape, data_type };
+        SimpleTensor<T> recurrent_to_input_w{ recurrent_weights_shape, data_type };
+        SimpleTensor<T> recurrent_to_forget_w{ recurrent_weights_shape, data_type };
+        SimpleTensor<T> recurrent_to_cell_w{ recurrent_weights_shape, data_type };
+        SimpleTensor<T> recurrent_to_output_w{ recurrent_weights_shape, data_type };
+        SimpleTensor<T> cell_to_input_w{ cell_bias_shape, data_type };
+        SimpleTensor<T> cell_to_forget_w{ cell_bias_shape, data_type };
+        SimpleTensor<T> cell_to_output_w{ cell_bias_shape, data_type };
+        SimpleTensor<T> input_gate_bias{ cell_bias_shape, data_type };
+        SimpleTensor<T> forget_gate_bias{ cell_bias_shape, data_type };
+        SimpleTensor<T> cell_bias{ cell_bias_shape, data_type };
+        SimpleTensor<T> output_gate_bias{ cell_bias_shape, data_type };
+        SimpleTensor<T> projection_w{ recurrent_weights_shape, data_type };
+        SimpleTensor<T> projection_bias{ projection_bias_shape, data_type };
+        SimpleTensor<T> output_state{ output_shape, data_type };
+        SimpleTensor<T> cell_state{ output_cell_shape, data_type };
+        SimpleTensor<T> scratch{ scratch_shape, data_type };
+        SimpleTensor<T> output{ output_shape, data_type };
+
+        // Fill reference
+        fill(input, 0);
+        fill(input_to_forget_w, 1);
+        fill(input_to_cell_w, 2);
+        fill(input_to_output_w, 3);
+        fill(recurrent_to_forget_w, 4);
+        fill(recurrent_to_cell_w, 5);
+        fill(recurrent_to_output_w, 6);
+        fill(forget_gate_bias, 7);
+        fill(cell_bias, 8);
+        fill(output_gate_bias, 9);
+        fill(output_state, 10);
+        fill(cell_state, 11);
+        fill(scratch, 12);
+        fill(input_to_input_w, 13);
+        fill(recurrent_to_input_w, 14);
+        fill(cell_to_input_w, 15);
+        fill(recurrent_to_input_w, 16);
+        fill(input_gate_bias, 17);
+        fill(cell_to_output_w, 18);
+        fill(projection_w, 19);
+        fill(projection_bias, 20);
+
+        bool cifg_opt = scratch_shape.x() == cell_bias_shape.x() * 4 ? true : false;
+
+        // Compute forget_gate
+        SimpleTensor<T> fully_connected_forget = reference::fully_connected_layer(input, input_to_forget_w, forget_gate_bias, output_cell_shape);
+        SimpleTensor<T> transposed_weights     = reference::transpose(recurrent_to_forget_w);
+        SimpleTensor<T> gemm                   = reference::gemm(output_state, transposed_weights, cell_state, 1.f, 0.f);
+        SimpleTensor<T> forget_gate            = reference::arithmetic_addition(fully_connected_forget, gemm, data_type, ConvertPolicy::SATURATE);
+
+        if(peephole_opt)
+        {
+            transposed_weights = reference::transpose(cell_to_forget_w);
+            gemm               = reference::gemm(cell_state, transposed_weights, gemm_out, 1.f, 0.f);
+            forget_gate        = reference::arithmetic_addition(forget_gate, gemm, data_type, ConvertPolicy::SATURATE);
+        }
+
+        forget_gate = reference::activation_layer(forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+
+        // Compute input_gate
+        SimpleTensor<T> input_gate;
+        if(cifg_opt)
+        {
+            SimpleTensor<T> ones{ cell_bias_shape, data_type };
+            fill_custom_val(ones, 1.f, 0);
+            input_gate = reference::arithmetic_subtraction<T, T, T>(ones, forget_gate, data_type, ConvertPolicy::SATURATE);
+        }
+        else
+        {
+            SimpleTensor<T> fully_connected_input = reference::fully_connected_layer(input, input_to_input_w, input_gate_bias, output_cell_shape);
+            transposed_weights                    = reference::transpose(recurrent_to_input_w);
+            gemm                                  = reference::gemm(output_state, transposed_weights, cell_state, 1.f, 0.f);
+            input_gate                            = reference::arithmetic_addition(fully_connected_input, gemm, data_type, ConvertPolicy::SATURATE);
+            transposed_weights                    = reference::transpose(cell_to_input_w);
+            gemm                                  = reference::gemm(cell_state, transposed_weights, gemm_out, 1.f, 0.f);
+            input_gate                            = reference::arithmetic_addition(input_gate, gemm, data_type, ConvertPolicy::SATURATE);
+            input_gate                            = reference::activation_layer(input_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+        }
+
+        // Compute cell_state
+        SimpleTensor<T> fully_connected_cell_state = reference::fully_connected_layer(input, input_to_cell_w, cell_bias, output_cell_shape);
+        transposed_weights                         = reference::transpose(recurrent_to_cell_w);
+        gemm                                       = reference::gemm(output_state, transposed_weights, cell_state, 1.f, 0.f);
+        SimpleTensor<T> pixelwise_mul              = reference::pixel_wise_multiplication(cell_state, forget_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
+        cell_state                                 = reference::arithmetic_addition(fully_connected_cell_state, gemm, data_type, ConvertPolicy::SATURATE);
+        cell_state                                 = reference::activation_layer(cell_state, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+        cell_state                                 = reference::pixel_wise_multiplication(cell_state, input_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
+        cell_state                                 = reference::arithmetic_addition(cell_state, pixelwise_mul, data_type, ConvertPolicy::SATURATE);
+        if(cell_threshold != 0.f)
+        {
+            cell_state = reference::activation_layer(cell_state, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -cell_threshold, cell_threshold));
+        }
+
+        // Compute output
+        SimpleTensor<T> fully_connected_output = reference::fully_connected_layer(input, input_to_output_w, output_gate_bias, output_cell_shape);
+        transposed_weights                     = reference::transpose(recurrent_to_output_w);
+        gemm                                   = reference::gemm(output_state, transposed_weights, cell_state, 1.f, 0.f);
+        output                                 = reference::arithmetic_addition(fully_connected_output, gemm, data_type, ConvertPolicy::SATURATE);
+        if(peephole_opt)
+        {
+            transposed_weights = reference::transpose(cell_to_output_w);
+            gemm               = reference::gemm(cell_state, transposed_weights, gemm_out, 1.f, 0.f);
+            output             = reference::arithmetic_addition(output, gemm, data_type, ConvertPolicy::SATURATE);
+        }
+        output = reference::activation_layer(output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+
+        // Compute output state
+        SimpleTensor<T> cell_state_activation = reference::activation_layer(cell_state, info);
+        output_state                          = reference::pixel_wise_multiplication(output, cell_state_activation, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
+
+        if(projection_opt)
+        {
+            SimpleTensor<T> fully_connected_projection = reference::fully_connected_layer(output_state, projection_w, projection_bias, output_cell_shape);
+            if(projection_threshold != 0.f)
+            {
+                output_state = reference::activation_layer(fully_connected_projection, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold, projection_threshold));
+            }
+        }
+        return output_state;
+    }
+
+    TensorType      _target{};
+    SimpleTensor<T> _reference{};
+};
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_TEST_LSTM_LAYER_FIXTURE */
diff --git a/tests/validation/fixtures/WinogradLayerFixture.h b/tests/validation/fixtures/LocallyConnectedFixture.h
similarity index 63%
rename from tests/validation/fixtures/WinogradLayerFixture.h
rename to tests/validation/fixtures/LocallyConnectedFixture.h
index d7f0cbf..f87e6e4 100644
--- a/tests/validation/fixtures/WinogradLayerFixture.h
+++ b/tests/validation/fixtures/LocallyConnectedFixture.h
@@ -21,9 +21,10 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_TEST_WINOGRAD_LAYER_FIXTURE
-#define ARM_COMPUTE_TEST_WINOGRAD_LAYER_FIXTURE
+#ifndef ARM_COMPUTE_TEST_LOCALLY_CONNECTED_FIXTURE
+#define ARM_COMPUTE_TEST_LOCALLY_CONNECTED_FIXTURE
 
+#include "arm_compute/core/Error.h"
 #include "arm_compute/core/TensorShape.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
@@ -33,73 +34,65 @@
 #include "tests/framework/Asserts.h"
 #include "tests/framework/Fixture.h"
 #include "tests/validation/Helpers.h"
-#include "tests/validation/reference/ConvolutionLayer.h"
+#include "tests/validation/reference/LocallyConnected.h"
 #include "tests/validation/reference/Utils.h"
 
 #include <random>
 
 namespace arm_compute
 {
-class NEWinogradLayer;
+class NELocallyConnected;
 
 namespace test
 {
 namespace validation
 {
 template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
-class WinogradLayerValidationFixture : public framework::Fixture
+class LocallyConnectedValidationFixture : public framework::Fixture
 {
 public:
+    using TBias = typename std::conditional<std::is_same<typename std::decay<T>::type, uint8_t>::value, int32_t, T>::type;
+
+public:
     template <typename...>
-    void setup(TensorShape input_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape output_shape, PadStrideInfo info)
+    void setup(TensorShape input_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape output_shape, PadStrideInfo info, Size2D dilation, DataType data_type)
     {
+        ARM_COMPUTE_UNUSED(dilation);
+
+        _data_type      = data_type;
+        _bias_data_type = data_type;
+
         _target    = compute_target(input_shape, weights_shape, bias_shape, output_shape, info);
         _reference = compute_reference(input_shape, weights_shape, bias_shape, output_shape, info);
     }
 
 protected:
     template <typename U>
-    void fill(U &&tensor, int i, float min, float max)
+    void fill(U &&tensor, int i)
     {
-        switch(tensor.data_type())
-        {
-            case DataType::F32:
-            {
-                std::uniform_real_distribution<> distribution(min, max);
-                library->fill(tensor, distribution, i);
-                break;
-            }
-            default:
-            {
-                ARM_COMPUTE_ERROR("Not supported");
-                library->fill_tensor_uniform(tensor, i);
-                break;
-            }
-        }
+        std::uniform_real_distribution<> distribution(-1.0f, 1.0f);
+        library->fill(tensor, distribution, i);
     }
 
     TensorType compute_target(const TensorShape &input_shape, const TensorShape &weights_shape, const TensorShape &bias_shape, const TensorShape &output_shape, const PadStrideInfo &info)
     {
+        TensorShape reshaped_weights_shape(weights_shape);
+
         // Create tensors
-        TensorType src     = create_tensor<TensorType>(input_shape, DataType::F32, 1);
-        TensorType weights = create_tensor<TensorType>(weights_shape, DataType::F32, 1);
-        TensorType bias    = create_tensor<TensorType>(bias_shape, DataType::F32, 1);
-        TensorType dst     = create_tensor<TensorType>(output_shape, DataType::F32, 1);
+        TensorType src     = create_tensor<TensorType>(input_shape, _data_type);
+        TensorType weights = create_tensor<TensorType>(reshaped_weights_shape, _data_type);
+        TensorType bias    = create_tensor<TensorType>(bias_shape, _bias_data_type);
+        TensorType dst     = create_tensor<TensorType>(output_shape, _data_type);
 
         // Create and configure function
-        FunctionType conv;
-        conv.configure(&src, &weights, &bias, &dst, info);
-
-        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(weights.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(bias.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        FunctionType locally_connected;
+        locally_connected.configure(&src, &weights, &bias, &dst, info);
 
         // Allocate tensors
         src.allocator()->allocate();
         weights.allocator()->allocate();
-        dst.allocator()->allocate();
         bias.allocator()->allocate();
+        dst.allocator()->allocate();
 
         ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
         ARM_COMPUTE_EXPECT(!weights.info()->is_resizable(), framework::LogLevel::ERRORS);
@@ -107,13 +100,11 @@
         ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
 
         // Fill tensors
-        fill(AccessorType(src), 0, -1.f, 1.f);
-        fill(AccessorType(weights), 1, -1.f, 1.f);
-        fill(AccessorType(bias), 2, -1.f, 1.f);
-        fill(AccessorType(dst), 3, -1.f, 1.f);
+        fill(AccessorType(src), 0);
+        fill(AccessorType(weights), 1);
+        fill(AccessorType(bias), 2);
 
-        // Compute NEWinogradLayer function
-        conv.run();
+        locally_connected.run();
 
         return dst;
     }
@@ -121,25 +112,25 @@
     SimpleTensor<T> compute_reference(const TensorShape &input_shape, const TensorShape &weights_shape, const TensorShape &bias_shape, const TensorShape &output_shape, const PadStrideInfo &info)
     {
         // Create reference
-        SimpleTensor<T> src{ input_shape, DataType::F32, 1 };
-        SimpleTensor<T> weights{ weights_shape, DataType::F32, 1 };
-        SimpleTensor<T> bias{ bias_shape, DataType::F32, 1 };
+        SimpleTensor<T>     src(input_shape, _data_type);
+        SimpleTensor<T>     weights(weights_shape, _data_type);
+        SimpleTensor<TBias> bias(bias_shape, _bias_data_type);
 
         // Fill reference
-        fill(src, 0, -1.f, 1.f);
-        fill(weights, 1, -1.f, 1.f);
-        fill(bias, 2, -1.f, 1.f);
+        fill(src, 0);
+        fill(weights, 1);
+        fill(bias, 2);
 
-        return reference::convolution_layer<T>(src, weights, bias, output_shape, info);
+        return reference::locally_connected<T>(src, weights, bias, output_shape, info);
     }
 
     TensorType      _target{};
     SimpleTensor<T> _reference{};
-    int             _fractional_bits{};
     DataType        _data_type{};
+    DataType        _bias_data_type{};
 };
 
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_WINOGRAD_LAYER_FIXTURE */
+#endif /* ARM_COMPUTE_TEST_LOCALLY_CONNECTED_FIXTURE */
diff --git a/tests/validation/fixtures/OpticalFlowFixture.h b/tests/validation/fixtures/OpticalFlowFixture.h
new file mode 100644
index 0000000..f8f2021
--- /dev/null
+++ b/tests/validation/fixtures/OpticalFlowFixture.h
@@ -0,0 +1,186 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_TEST_OPTICAL_FLOW
+#define ARM_COMPUTE_TEST_OPTICAL_FLOW
+
+#include "arm_compute/core/PyramidInfo.h"
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Types.h"
+#include "tests/AssetsLibrary.h"
+#include "tests/Globals.h"
+#include "tests/IAccessor.h"
+#include "tests/Types.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Fixture.h"
+#include "tests/validation/reference/OpticalFlow.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+template <typename TensorType,
+          typename AccessorType,
+          typename ArrayType,
+          typename ArrayAccessorType,
+          typename FunctionType,
+          typename PyramidType,
+          typename PyramidFunctionType,
+          typename T>
+
+class OpticalFlowValidationFixture : public framework::Fixture
+{
+public:
+    template <typename...>
+    void setup(std::string old_image_name, std::string new_image_name, OpticalFlowParameters params,
+               size_t num_levels, size_t num_keypoints, Format format, BorderMode border_mode)
+    {
+        std::mt19937                           gen(library->seed());
+        std::uniform_int_distribution<uint8_t> int_dist(0, 255);
+        const uint8_t                          constant_border_value = int_dist(gen);
+
+        // Create keypoints
+        std::vector<KeyPoint> old_keypoints           = generate_random_keypoints(library->get_image_shape(old_image_name), num_keypoints, library->seed(), num_levels);
+        std::vector<KeyPoint> new_keypoints_estimates = old_keypoints;
+
+        _target    = compute_target(old_image_name, new_image_name, params, num_levels, old_keypoints, new_keypoints_estimates, format, border_mode, constant_border_value);
+        _reference = compute_reference(old_image_name, new_image_name, params, num_levels, old_keypoints, new_keypoints_estimates, format, border_mode, constant_border_value);
+    }
+
+protected:
+    template <typename V>
+    void fill(V &&tensor, const std::string image, Format format)
+    {
+        library->fill(tensor, image, format);
+    }
+
+    ArrayType compute_target(std::string old_image_name, std::string new_image_name, OpticalFlowParameters params, size_t num_levels,
+                             std::vector<KeyPoint> &old_keypoints, std::vector<KeyPoint> &new_keypoints_estimates,
+                             Format format, BorderMode border_mode, uint8_t constant_border_value)
+    {
+        // Get image shapes
+        TensorShape old_shape = library->get_image_shape(old_image_name);
+        TensorShape new_shape = library->get_image_shape(new_image_name);
+
+        // Create tensors
+        auto old_image = create_tensor<TensorType>(old_shape, format);
+        auto new_image = create_tensor<TensorType>(new_shape, format);
+
+        // Load keypoints
+        ArrayType old_points(old_keypoints.size());
+        ArrayType new_points_estimates(new_keypoints_estimates.size());
+        ArrayType new_points(old_keypoints.size());
+
+        fill_array(ArrayAccessorType(old_points), old_keypoints);
+        fill_array(ArrayAccessorType(new_points_estimates), new_keypoints_estimates);
+
+        // Create pyramid images
+        PyramidInfo pyramid_info(num_levels, SCALE_PYRAMID_HALF, old_image.info()->tensor_shape(), format);
+        PyramidType old_pyramid = create_pyramid<PyramidType>(pyramid_info);
+        PyramidType new_pyramid = create_pyramid<PyramidType>(pyramid_info);
+
+        // Create and configure pyramid functions
+        PyramidFunctionType old_gp;
+        old_gp.configure(&old_image, &old_pyramid, border_mode, constant_border_value);
+
+        PyramidFunctionType new_gp;
+        new_gp.configure(&new_image, &new_pyramid, border_mode, constant_border_value);
+
+        for(size_t i = 0; i < pyramid_info.num_levels(); ++i)
+        {
+            ARM_COMPUTE_EXPECT(old_pyramid.get_pyramid_level(i)->info()->is_resizable(), framework::LogLevel::ERRORS);
+            ARM_COMPUTE_EXPECT(new_pyramid.get_pyramid_level(i)->info()->is_resizable(), framework::LogLevel::ERRORS);
+        }
+
+        // Create and configure optical flow function
+        FunctionType optical_flow;
+
+        optical_flow.configure(&old_pyramid,
+                               &new_pyramid,
+                               &old_points,
+                               &new_points_estimates,
+                               &new_points,
+                               params.termination,
+                               params.epsilon,
+                               params.num_iterations,
+                               params.window_dimension,
+                               params.use_initial_estimate,
+                               border_mode,
+                               constant_border_value);
+
+        ARM_COMPUTE_EXPECT(old_image.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(new_image.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+        // Allocate input tensors
+        old_image.allocator()->allocate();
+        new_image.allocator()->allocate();
+
+        // Allocate pyramids
+        old_pyramid.allocate();
+        new_pyramid.allocate();
+
+        ARM_COMPUTE_EXPECT(!old_image.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(!new_image.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+        for(size_t i = 0; i < pyramid_info.num_levels(); ++i)
+        {
+            ARM_COMPUTE_EXPECT(!old_pyramid.get_pyramid_level(i)->info()->is_resizable(), framework::LogLevel::ERRORS);
+            ARM_COMPUTE_EXPECT(!new_pyramid.get_pyramid_level(i)->info()->is_resizable(), framework::LogLevel::ERRORS);
+        }
+
+        // Fill tensors
+        fill(AccessorType(old_image), old_image_name, format);
+        fill(AccessorType(new_image), new_image_name, format);
+
+        // Compute functions
+        old_gp.run();
+        new_gp.run();
+        optical_flow.run();
+
+        return new_points;
+    }
+
+    std::vector<KeyPoint> compute_reference(std::string old_image_name, std::string new_image_name,
+                                            OpticalFlowParameters params, size_t num_levels,
+                                            std::vector<KeyPoint> &old_keypoints, std::vector<KeyPoint> &new_keypoints_estimates,
+                                            Format format, BorderMode border_mode, uint8_t constant_border_value)
+    {
+        SimpleTensor<T> old_image{ library->get_image_shape(old_image_name), data_type_from_format(format) };
+        SimpleTensor<T> new_image{ library->get_image_shape(new_image_name), data_type_from_format(format) };
+
+        fill(old_image, old_image_name, format);
+        fill(new_image, new_image_name, format);
+
+        return reference::optical_flow<T>(old_image, new_image, params, num_levels, old_keypoints, new_keypoints_estimates,
+                                          border_mode, constant_border_value);
+    }
+
+    ArrayType             _target{};
+    std::vector<KeyPoint> _reference{};
+};
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_TEST_OPTICAL_FLOW */
diff --git a/tests/validation/fixtures/PoolingLayerFixture.h b/tests/validation/fixtures/PoolingLayerFixture.h
index 3bbb403..27b033a 100644
--- a/tests/validation/fixtures/PoolingLayerFixture.h
+++ b/tests/validation/fixtures/PoolingLayerFixture.h
@@ -47,13 +47,13 @@
 {
 public:
     template <typename...>
-    void setup(TensorShape shape, PoolingLayerInfo pool_info, DataType data_type, int fractional_bits, QuantizationInfo quantization_info)
+    void setup(TensorShape shape, PoolingLayerInfo pool_info, DataType data_type, DataLayout data_layout, int fractional_bits, QuantizationInfo quantization_info)
     {
         _fractional_bits   = fractional_bits;
         _quantization_info = quantization_info;
         _pool_info         = pool_info;
 
-        _target    = compute_target(shape, pool_info, data_type, fractional_bits, quantization_info);
+        _target    = compute_target(shape, pool_info, data_type, data_layout, fractional_bits, quantization_info);
         _reference = compute_reference(shape, pool_info, data_type, fractional_bits, quantization_info);
     }
 
@@ -78,11 +78,17 @@
         }
     }
 
-    TensorType compute_target(const TensorShape &shape, PoolingLayerInfo info,
-                              DataType data_type, int fixed_point_position, QuantizationInfo quantization_info)
+    TensorType compute_target(TensorShape shape, PoolingLayerInfo info,
+                              DataType data_type, DataLayout data_layout, int fixed_point_position, QuantizationInfo quantization_info)
     {
+        // Change shape in case of NHWC.
+        if(data_layout == DataLayout::NHWC)
+        {
+            permute(shape, PermutationVector(2U, 0U, 1U));
+        }
+
         // Create tensors
-        TensorType src = create_tensor<TensorType>(shape, data_type, 1, fixed_point_position, quantization_info);
+        TensorType src = create_tensor<TensorType>(shape, data_type, 1, fixed_point_position, quantization_info, data_layout);
         TensorType dst;
 
         // Create and configure function
@@ -132,10 +138,10 @@
 {
 public:
     template <typename...>
-    void setup(TensorShape shape, PoolingType pool_type, Size2D pool_size, PadStrideInfo pad_stride_info, bool exclude_padding, DataType data_type)
+    void setup(TensorShape shape, PoolingType pool_type, Size2D pool_size, PadStrideInfo pad_stride_info, bool exclude_padding, DataType data_type, DataLayout data_layout)
     {
         PoolingLayerValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, PoolingLayerInfo(pool_type, pool_size, pad_stride_info, exclude_padding),
-                                                                                               data_type, 0, QuantizationInfo());
+                                                                                               data_type, data_layout, 0, QuantizationInfo());
     }
 };
 
@@ -147,7 +153,7 @@
     void setup(TensorShape shape, PoolingType pool_type, Size2D pool_size, PadStrideInfo pad_stride_info, bool exclude_padding, DataType data_type, int fractional_bits)
     {
         PoolingLayerValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, PoolingLayerInfo(pool_type, pool_size, pad_stride_info, exclude_padding),
-                                                                                               data_type, fractional_bits, QuantizationInfo());
+                                                                                               data_type, DataLayout::NCHW, fractional_bits, QuantizationInfo());
     }
 };
 
@@ -156,10 +162,11 @@
 {
 public:
     template <typename...>
-    void setup(TensorShape shape, PoolingType pool_type, Size2D pool_size, PadStrideInfo pad_stride_info, bool exclude_padding, DataType data_type, QuantizationInfo quantization_info)
+    void setup(TensorShape shape, PoolingType pool_type, Size2D pool_size, PadStrideInfo pad_stride_info, bool exclude_padding, DataType data_type,
+               QuantizationInfo quantization_info, DataLayout data_layout = DataLayout::NCHW)
     {
         PoolingLayerValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, PoolingLayerInfo(pool_type, pool_size, pad_stride_info, exclude_padding),
-                                                                                               data_type, 0, quantization_info);
+                                                                                               data_type, data_layout, 0, quantization_info);
     }
 };
 
@@ -168,10 +175,9 @@
 {
 public:
     template <typename...>
-    void setup(TensorShape src_shape, TensorShape dst_shape, PoolingLayerInfo pool_info, DataType data_type)
+    void setup(TensorShape src_shape, PoolingLayerInfo pool_info, DataType data_type)
     {
-        ARM_COMPUTE_UNUSED(dst_shape);
-        PoolingLayerValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(src_shape, pool_info, data_type, 0, QuantizationInfo());
+        PoolingLayerValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(src_shape, pool_info, data_type, DataLayout::NCHW, 0, QuantizationInfo());
     }
 };
 
@@ -180,9 +186,9 @@
 {
 public:
     template <typename...>
-    void setup(TensorShape shape, PoolingType pool_type, DataType data_type)
+    void setup(TensorShape shape, PoolingType pool_type, DataType data_type, DataLayout data_layout = DataLayout::NCHW)
     {
-        PoolingLayerValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, PoolingLayerInfo(pool_type), data_type, 0, QuantizationInfo());
+        PoolingLayerValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, PoolingLayerInfo(pool_type), data_type, data_layout, 0, QuantizationInfo());
     }
 };
 
diff --git a/tests/validation/fixtures/RNNLayerFixture.h b/tests/validation/fixtures/RNNLayerFixture.h
new file mode 100644
index 0000000..42b99cc
--- /dev/null
+++ b/tests/validation/fixtures/RNNLayerFixture.h
@@ -0,0 +1,145 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_TEST_RNN_LAYER_FIXTURE
+#define ARM_COMPUTE_TEST_RNN_LAYER_FIXTURE
+
+#include "tests/Globals.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Fixture.h"
+#include "tests/validation/reference/ActivationLayer.h"
+#include "tests/validation/reference/ArithmeticAddition.h"
+#include "tests/validation/reference/FullyConnectedLayer.h"
+#include "tests/validation/reference/GEMM.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class RNNLayerValidationFixture : public framework::Fixture
+{
+public:
+    template <typename...>
+    void setup(TensorShape input_shape, TensorShape weights_shape, TensorShape recurrent_weights_shape, TensorShape bias_shape, TensorShape output_shape, ActivationLayerInfo info,
+               DataType data_type)
+    {
+        _target    = compute_target(input_shape, weights_shape, recurrent_weights_shape, bias_shape, output_shape, info, data_type);
+        _reference = compute_reference(input_shape, weights_shape, recurrent_weights_shape, bias_shape, output_shape, info, data_type);
+    }
+
+protected:
+    template <typename U>
+    void fill(U &&tensor, int i)
+    {
+        std::uniform_real_distribution<> distribution(-1.0f, 1.0f);
+        library->fill(tensor, distribution, i);
+    }
+
+    TensorType compute_target(const TensorShape &input_shape, const TensorShape &weights_shape, const TensorShape &recurrent_weights_shape, const TensorShape &bias_shape, const TensorShape &output_shape,
+                              ActivationLayerInfo info, DataType data_type)
+    {
+        // Create tensors
+        TensorType input             = create_tensor<TensorType>(input_shape, data_type);
+        TensorType weights           = create_tensor<TensorType>(weights_shape, data_type);
+        TensorType recurrent_weights = create_tensor<TensorType>(recurrent_weights_shape, data_type);
+        TensorType bias              = create_tensor<TensorType>(bias_shape, data_type);
+        TensorType hidden_state      = create_tensor<TensorType>(output_shape, data_type);
+        TensorType output            = create_tensor<TensorType>(output_shape, data_type);
+
+        // Create and configure function
+        FunctionType rnn;
+        rnn.configure(&input, &weights, &recurrent_weights, &bias, &hidden_state, &output, info);
+
+        ARM_COMPUTE_EXPECT(input.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(weights.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(recurrent_weights.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(bias.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(hidden_state.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(output.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+        // Allocate tensors
+        input.allocator()->allocate();
+        weights.allocator()->allocate();
+        recurrent_weights.allocator()->allocate();
+        bias.allocator()->allocate();
+        hidden_state.allocator()->allocate();
+        output.allocator()->allocate();
+
+        ARM_COMPUTE_EXPECT(!input.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(!weights.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(!recurrent_weights.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(!bias.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(!hidden_state.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(!output.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+        // Fill tensors
+        fill(AccessorType(input), 0);
+        fill(AccessorType(weights), 0);
+        fill(AccessorType(recurrent_weights), 0);
+        fill(AccessorType(bias), 0);
+        fill(AccessorType(hidden_state), 0);
+
+        // Compute function
+        rnn.run();
+
+        return output;
+    }
+
+    SimpleTensor<T> compute_reference(const TensorShape &input_shape, const TensorShape &weights_shape, const TensorShape &recurrent_weights_shape, const TensorShape &bias_shape,
+                                      const TensorShape &output_shape, ActivationLayerInfo info, DataType data_type)
+    {
+        // Create reference
+        SimpleTensor<T> input{ input_shape, data_type };
+        SimpleTensor<T> weights{ weights_shape, data_type };
+        SimpleTensor<T> recurrent_weights{ recurrent_weights_shape, data_type };
+        SimpleTensor<T> bias{ bias_shape, data_type };
+        SimpleTensor<T> hidden_state{ output_shape, data_type };
+
+        // Fill reference
+        fill(input, 0);
+        fill(weights, 0);
+        fill(recurrent_weights, 0);
+        fill(bias, 0);
+        fill(hidden_state, 0);
+
+        TensorShape out_shape = recurrent_weights_shape;
+        out_shape.set(1, output_shape.y());
+
+        // Compute reference
+        SimpleTensor<T> out_w{ out_shape, data_type };
+        SimpleTensor<T> fully_connected = reference::fully_connected_layer(input, weights, bias, out_shape);
+        SimpleTensor<T> gemm            = reference::gemm(hidden_state, recurrent_weights, out_w, 1.f, 0.f);
+        SimpleTensor<T> add_res         = reference::arithmetic_addition(fully_connected, gemm, data_type, ConvertPolicy::SATURATE);
+        return reference::activation_layer(add_res, info);
+    }
+
+    TensorType      _target{};
+    SimpleTensor<T> _reference{};
+};
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_TEST_RNN_LAYER_FIXTURE */
diff --git a/tests/validation/fixtures/ScaleFixture.h b/tests/validation/fixtures/ScaleFixture.h
index fe24f5b..ec10231 100644
--- a/tests/validation/fixtures/ScaleFixture.h
+++ b/tests/validation/fixtures/ScaleFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -44,7 +44,7 @@
 {
 public:
     template <typename...>
-    void setup(TensorShape shape, DataType data_type, InterpolationPolicy policy, BorderMode border_mode, SamplingPolicy sampling_policy)
+    void setup(TensorShape shape, DataType data_type, DataLayout data_layout, InterpolationPolicy policy, BorderMode border_mode, SamplingPolicy sampling_policy)
     {
         constexpr float max_width  = 8192.0f;
         constexpr float max_height = 6384.0f;
@@ -60,13 +60,16 @@
         float                                 scale_x = distribution_float(generator);
         float                                 scale_y = distribution_float(generator);
 
-        scale_x = ((shape.x() * scale_x) > max_width) ? (max_width / shape.x()) : scale_x;
-        scale_y = ((shape.y() * scale_y) > max_height) ? (max_height / shape.y()) : scale_y;
+        const int idx_width  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+        const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+
+        scale_x = ((shape[idx_width] * scale_x) > max_width) ? (max_width / shape[idx_width]) : scale_x;
+        scale_y = ((shape[idx_height] * scale_y) > max_height) ? (max_height / shape[idx_height]) : scale_y;
 
         std::uniform_int_distribution<uint8_t> distribution_u8(0, 255);
         T                                      constant_border_value = static_cast<T>(distribution_u8(generator));
 
-        _target    = compute_target(shape, scale_x, scale_y, policy, border_mode, constant_border_value, sampling_policy);
+        _target    = compute_target(shape, data_layout, scale_x, scale_y, policy, border_mode, constant_border_value, sampling_policy);
         _reference = compute_reference(shape, scale_x, scale_y, policy, border_mode, constant_border_value, sampling_policy);
     }
 
@@ -74,18 +77,37 @@
     template <typename U>
     void fill(U &&tensor)
     {
-        library->fill_tensor_uniform(tensor, 0);
+        if(!is_data_type_float(_data_type))
+        {
+            library->fill_tensor_uniform(tensor, 0);
+        }
+        else
+        {
+            // Restrict range for float to avoid any floating point issues
+            std::uniform_real_distribution<> distribution(-5.0f, 5.0f);
+            library->fill(tensor, distribution, 0);
+        }
     }
 
-    TensorType compute_target(const TensorShape &shape, const float scale_x, const float scale_y,
+    TensorType compute_target(TensorShape shape, DataLayout data_layout, const float scale_x, const float scale_y,
                               InterpolationPolicy policy, BorderMode border_mode, T constant_border_value, SamplingPolicy sampling_policy)
     {
+        // Change shape in case of NHWC.
+        if(data_layout == DataLayout::NHWC)
+        {
+            permute(shape, PermutationVector(2U, 0U, 1U));
+        }
+
         // Create tensors
-        TensorType  src = create_tensor<TensorType>(shape, _data_type);
+        TensorType src = create_tensor<TensorType>(shape, _data_type, 1, 0, QuantizationInfo(), data_layout);
+
+        const int idx_width  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+        const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+
         TensorShape shape_scaled(shape);
-        shape_scaled.set(0, shape[0] * scale_x);
-        shape_scaled.set(1, shape[1] * scale_y);
-        TensorType dst = create_tensor<TensorType>(shape_scaled, _data_type);
+        shape_scaled.set(idx_width, shape[idx_width] * scale_x);
+        shape_scaled.set(idx_height, shape[idx_height] * scale_y);
+        TensorType dst = create_tensor<TensorType>(shape_scaled, _data_type, 1, 0, QuantizationInfo(), data_layout);
 
         // Create and configure function
         FunctionType scale;
@@ -114,7 +136,7 @@
                                       InterpolationPolicy policy, BorderMode border_mode, T constant_border_value, SamplingPolicy sampling_policy)
     {
         // Create reference
-        SimpleTensor<T> src{ shape, _data_type };
+        SimpleTensor<T> src{ shape, _data_type, 1, 0, QuantizationInfo() };
 
         // Fill reference
         fill(src);
diff --git a/tests/validation/fixtures/UNIT/MemoryManagerFixture.h b/tests/validation/fixtures/UNIT/MemoryManagerFixture.h
new file mode 100644
index 0000000..21ad42b
--- /dev/null
+++ b/tests/validation/fixtures/UNIT/MemoryManagerFixture.h
@@ -0,0 +1,411 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_TEST_UNIT_MEMORY_MANAGER
+#define ARM_COMPUTE_TEST_UNIT_MEMORY_MANAGER
+
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/BlobLifetimeManager.h"
+#include "arm_compute/runtime/MemoryManagerOnDemand.h"
+#include "arm_compute/runtime/PoolManager.h"
+#include "tests/AssetsLibrary.h"
+#include "tests/Globals.h"
+#include "tests/IAccessor.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Fixture.h"
+#include "tests/validation/Helpers.h"
+#include "tests/validation/reference/FullyConnectedLayer.h"
+#include "tests/validation/reference/SoftmaxLayer.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+/** Simple test case to run two fully connected layers using a blob affinity memory manager
+ *
+ * Runs two fully connected layers back to back
+ */
+template <typename TensorType, typename AccessorType, typename AllocatorType, typename FullyConnectedFunction>
+class BlobMemoryManagerSimpleTestCaseFixture : public framework::Fixture
+{
+    using T = float;
+
+public:
+    void setup()
+    {
+        _target    = compute_target();
+        _reference = compute_reference();
+    };
+
+protected:
+    template <typename U>
+    void fill(U &&tensor, int i)
+    {
+        std::uniform_real_distribution<> distribution(0.5f, 1.f);
+        library->fill(tensor, distribution, i);
+    }
+
+    TensorType compute_target()
+    {
+        auto lifetime_mgr = std::make_shared<BlobLifetimeManager>();
+        auto pool_mgr     = std::make_shared<PoolManager>();
+        auto mm           = std::make_shared<MemoryManagerOnDemand>(lifetime_mgr, pool_mgr);
+
+        // Create tensors
+        TensorType w1  = create_tensor<TensorType>(TensorShape(128U, 128U), DataType::F32, 1);
+        TensorType b1  = create_tensor<TensorType>(TensorShape(128U), DataType::F32, 1);
+        TensorType w2  = create_tensor<TensorType>(TensorShape(128U, 24U), DataType::F32, 1);
+        TensorType b2  = create_tensor<TensorType>(TensorShape(24U), DataType::F32, 1);
+        TensorType src = create_tensor<TensorType>(TensorShape(128U), DataType::F32, 1);
+        TensorType fc1 = create_tensor<TensorType>(TensorShape(128U), DataType::F32, 1);
+        TensorType dst = create_tensor<TensorType>(TensorShape(24U), DataType::F32, 1);
+
+        // Create and configure function
+        FullyConnectedFunction fc_layer_1(mm);
+        FullyConnectedFunction fc_layer_2(mm);
+        fc_layer_1.configure(&src, &w1, &b1, &fc1);
+        fc_layer_2.configure(&fc1, &w2, &b2, &dst);
+
+        // Allocate tensors
+        w1.allocator()->allocate();
+        b1.allocator()->allocate();
+        w2.allocator()->allocate();
+        b2.allocator()->allocate();
+        src.allocator()->allocate();
+        fc1.allocator()->allocate();
+        dst.allocator()->allocate();
+
+        // Finalize memory manager
+        mm->set_allocator(&_allocator);
+        mm->set_num_pools(1);
+        mm->finalize();
+        ARM_COMPUTE_EXPECT(mm->is_finalized(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(mm->lifetime_manager()->are_all_finalized(), framework::LogLevel::ERRORS);
+
+        // Fill tensors
+        fill(AccessorType(src), 0);
+        fill(AccessorType(w1), 1);
+        fill(AccessorType(b1), 2);
+        fill(AccessorType(w2), 3);
+        fill(AccessorType(b2), 4);
+
+        // Compute functions
+        fc_layer_1.run();
+        fc_layer_2.run();
+
+        return dst;
+    }
+
+    SimpleTensor<T> compute_reference()
+    {
+        // Create reference
+        SimpleTensor<T> w1{ TensorShape(128U, 128U), DataType::F32 };
+        SimpleTensor<T> b1{ TensorShape(128U), DataType::F32 };
+        SimpleTensor<T> w2{ TensorShape(128U, 24U), DataType::F32 };
+        SimpleTensor<T> b2{ TensorShape(24U), DataType::F32 };
+        SimpleTensor<T> src{ TensorShape(128U), DataType::F32 };
+
+        // Fill reference
+        fill(src, 0);
+        fill(w1, 1);
+        fill(b1, 2);
+        fill(w2, 3);
+        fill(b2, 4);
+
+        auto fc1 = reference::fully_connected_layer(src, w1, b1, TensorShape(128U));
+        return reference::fully_connected_layer(fc1, w2, b2, TensorShape(24U));
+    }
+
+protected:
+    TensorType      _target{};
+    SimpleTensor<T> _reference{};
+    AllocatorType   _allocator{};
+};
+
+/** Test case to run two fully connected layers using a blob affinity memory manager,
+ *  reconfigure with different shapes and rerun
+ *
+ * Runs two fully connected layers back to back then reconfigures with different batch size and reruns
+ * Shapes of the reconfigure step are smaller than the initial configured step
+ */
+template <typename TensorType, typename AccessorType, typename AllocatorType, typename FullyConnectedFunction>
+class BlobMemoryManagerReconfigureTestCaseFixture : public framework::Fixture
+{
+    using T = float;
+
+public:
+    void setup()
+    {
+        _max_batches = 8;
+        _cur_batches = 6;
+        _target      = compute_target();
+        _reference   = compute_reference();
+    };
+
+protected:
+    template <typename U>
+    void fill(U &&tensor, int i)
+    {
+        std::uniform_real_distribution<> distribution(0.5f, 1.f);
+        library->fill(tensor, distribution, i);
+    }
+
+    TensorType compute_target()
+    {
+        AllocatorType allocator{};
+        auto          lifetime_mgr = std::make_shared<BlobLifetimeManager>();
+        auto          pool_mgr     = std::make_shared<PoolManager>();
+        auto          mm           = std::make_shared<MemoryManagerOnDemand>(lifetime_mgr, pool_mgr);
+
+        // Create tensors
+        TensorType w1  = create_tensor<TensorType>(TensorShape(128U, 128U), DataType::F32, 1);
+        TensorType b1  = create_tensor<TensorType>(TensorShape(128U), DataType::F32, 1);
+        TensorType w2  = create_tensor<TensorType>(TensorShape(128U, 24U), DataType::F32, 1);
+        TensorType b2  = create_tensor<TensorType>(TensorShape(24U), DataType::F32, 1);
+        TensorType src = create_tensor<TensorType>(TensorShape(128U, _max_batches), DataType::F32, 1);
+        TensorType fc1 = create_tensor<TensorType>(TensorShape(128U, _max_batches), DataType::F32, 1);
+        TensorType dst = create_tensor<TensorType>(TensorShape(24U, _max_batches), DataType::F32, 1);
+
+        // Create and configure function
+        FullyConnectedFunction fc_layer_1(mm);
+        FullyConnectedFunction fc_layer_2(mm);
+        fc_layer_1.configure(&src, &w1, &b1, &fc1);
+        fc_layer_2.configure(&fc1, &w2, &b2, &dst);
+
+        // Allocate persistent tensors
+        w1.allocator()->allocate();
+        b1.allocator()->allocate();
+        w2.allocator()->allocate();
+        b2.allocator()->allocate();
+
+        // Allocate tensors (1st iteration)
+        src.allocator()->allocate();
+        fc1.allocator()->allocate();
+        dst.allocator()->allocate();
+
+        // Finalize memory manager
+        mm->set_allocator(&allocator);
+        mm->set_num_pools(1);
+        mm->finalize();
+        ARM_COMPUTE_EXPECT(mm->is_finalized(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(mm->lifetime_manager()->are_all_finalized(), framework::LogLevel::ERRORS);
+
+        // Fill tensors (1st iteration)
+        fill(AccessorType(src), 0);
+        fill(AccessorType(w1), 1);
+        fill(AccessorType(b1), 2);
+        fill(AccessorType(w2), 3);
+        fill(AccessorType(b2), 4);
+
+        // Compute functions (1st iteration)
+        fc_layer_1.run();
+        fc_layer_2.run();
+
+        // Update tensor shapes (2nd iteration)
+        auto src_padding     = src.allocator()->info().padding();
+        auto fc1_padding     = fc1.allocator()->info().padding();
+        auto dst_padding     = dst.allocator()->info().padding();
+        int  diff            = _max_batches - _cur_batches;
+        auto new_src_padding = PaddingSize(src_padding.top, src_padding.right, src_padding.bottom + diff, src_padding.left);
+        auto new_fc1_padding = PaddingSize(fc1_padding.top, fc1_padding.right, fc1_padding.bottom + diff, fc1_padding.left);
+        auto new_dst_padding = PaddingSize(dst_padding.top, dst_padding.right, dst_padding.bottom + diff, dst_padding.left);
+        src.allocator()->info().set_tensor_shape(TensorShape(128U, _cur_batches)).set_is_resizable(true).extend_padding(new_src_padding);
+        src.allocator()->info().set_is_resizable(false);
+        fc1.allocator()->info().set_tensor_shape(TensorShape(128U, _cur_batches)).set_is_resizable(true).extend_padding(new_fc1_padding);
+        fc1.allocator()->info().set_is_resizable(false);
+        dst.allocator()->info().set_tensor_shape(TensorShape(24U, _cur_batches)).set_is_resizable(true).extend_padding(new_dst_padding);
+        dst.allocator()->info().set_is_resizable(false);
+
+        // Configure functions (2nd iteration)
+        fc_layer_1.configure(&src, &w1, &b1, &fc1, true, false, true);
+        fc_layer_2.configure(&fc1, &w2, &b2, &dst, true, false, true);
+
+        // Fill tensors (2nd iteration)
+        fill(AccessorType(src), 5);
+
+        // Compute functions (2nd iteration)
+        fc_layer_1.run();
+        fc_layer_2.run();
+
+        return dst;
+    }
+
+    SimpleTensor<T> compute_reference()
+    {
+        // Create reference
+        SimpleTensor<T> w1{ TensorShape(128U, 128U), DataType::F32 };
+        SimpleTensor<T> b1{ TensorShape(128U), DataType::F32 };
+        SimpleTensor<T> w2{ TensorShape(128U, 24U), DataType::F32 };
+        SimpleTensor<T> b2{ TensorShape(24U), DataType::F32 };
+        SimpleTensor<T> src{ TensorShape(128U, _cur_batches), DataType::F32 };
+
+        // Fill reference
+        fill(src, 5);
+        fill(w1, 1);
+        fill(b1, 2);
+        fill(w2, 3);
+        fill(b2, 4);
+
+        auto fc1 = reference::fully_connected_layer(src, w1, b1, TensorShape(128U, _cur_batches));
+        return reference::fully_connected_layer(fc1, w2, b2, TensorShape(24U, _cur_batches));
+    }
+
+protected:
+    TensorType      _target{};
+    SimpleTensor<T> _reference{};
+    AllocatorType   _allocator{};
+    unsigned int    _max_batches{};
+    unsigned int    _cur_batches{};
+};
+
+/** Test case to run a fully connected layer followed by a softmax layer using a blob affinity memory manager,
+ *  reconfigure with different shapes and rerun
+ *
+ * Runs a fully connected layer followed by a softmax layer, then reconfigures with a different batch size and reruns
+ * Shapes of the reconfigure step are smaller than the initial configured step
+ */
+template <typename TensorType, typename AccessorType, typename AllocatorType, typename FullyConnectedFunction, typename SoftmaxFunction>
+class BlobMemoryManagerReconfigure2TestCaseFixture : public framework::Fixture
+{
+    using T = float;
+
+public:
+    void setup()
+    {
+        _max_batches = 30;
+        _cur_batches = 3;
+        _target      = compute_target();
+        _reference   = compute_reference();
+    };
+
+protected:
+    template <typename U>
+    void fill(U &&tensor, int i)
+    {
+        std::uniform_real_distribution<> distribution(0.5f, 1.f);
+        library->fill(tensor, distribution, i);
+    }
+
+    TensorType compute_target()
+    {
+        AllocatorType allocator{};
+        auto          lifetime_mgr = std::make_shared<BlobLifetimeManager>();
+        auto          pool_mgr     = std::make_shared<PoolManager>();
+        auto          mm           = std::make_shared<MemoryManagerOnDemand>(lifetime_mgr, pool_mgr);
+
+        // Create tensors
+        TensorType w   = create_tensor<TensorType>(TensorShape(112U, 8U), DataType::F32, 1);
+        TensorType b   = create_tensor<TensorType>(TensorShape(8U), DataType::F32, 1);
+        TensorType src = create_tensor<TensorType>(TensorShape(1U, 1U, 112U, _max_batches), DataType::F32, 1);
+        TensorType fc  = create_tensor<TensorType>(TensorShape(8U, _max_batches), DataType::F32, 1);
+        TensorType dst = create_tensor<TensorType>(TensorShape(8U, _max_batches), DataType::F32, 1);
+
+        // Create and configure function
+        FullyConnectedFunction fc_layer(mm);
+        SoftmaxFunction        smx_layer(mm);
+        fc_layer.configure(&src, &w, &b, &fc);
+        smx_layer.configure(&fc, &dst);
+
+        // Allocate persistent tensors
+        w.allocator()->allocate();
+        b.allocator()->allocate();
+
+        // Allocate tensors (1st iteration)
+        src.allocator()->allocate();
+        fc.allocator()->allocate();
+        dst.allocator()->allocate();
+
+        // Finalize memory manager
+        mm->set_allocator(&allocator);
+        mm->set_num_pools(1);
+        mm->finalize();
+        ARM_COMPUTE_EXPECT(mm->is_finalized(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(mm->lifetime_manager()->are_all_finalized(), framework::LogLevel::ERRORS);
+
+        // Fill tensors (1st iteration)
+        fill(AccessorType(src), 0);
+        fill(AccessorType(w), 1);
+        fill(AccessorType(b), 2);
+
+        // Compute functions (1st iteration)
+        fc_layer.run();
+        smx_layer.run();
+
+        // Get padding requirements
+        auto fc_padding = fc.allocator()->info().padding();
+
+        // Run rest iterations
+        for(int i = _max_batches; i >= static_cast<int>(_cur_batches); --i)
+        {
+            int  diff           = _max_batches - i;
+            auto new_fc_padding = PaddingSize(fc_padding.top, fc_padding.right, fc_padding.bottom + diff, fc_padding.left);
+            src.allocator()->info().set_tensor_shape(TensorShape(1U, 1U, 112U, i));
+            fc.allocator()->info().set_tensor_shape(TensorShape(8U, i)).set_is_resizable(true).extend_padding(new_fc_padding);
+            fc.allocator()->info().set_is_resizable(false);
+            dst.allocator()->info().set_tensor_shape(TensorShape(8U, i));
+
+            // Configure functions
+            fc_layer.configure(&src, &w, &b, &fc, true, false, true);
+            smx_layer.configure(&fc, &dst);
+
+            // Fill tensors
+            fill(AccessorType(src), 3);
+
+            // Compute functions
+            fc_layer.run();
+            smx_layer.run();
+        }
+
+        return dst;
+    }
+
+    SimpleTensor<T> compute_reference()
+    {
+        // Create reference
+        SimpleTensor<T> w{ TensorShape(112U, 8U), DataType::F32 };
+        SimpleTensor<T> b{ TensorShape(8U), DataType::F32 };
+        SimpleTensor<T> src{ TensorShape(1U, 1U, 112U, _cur_batches), DataType::F32 };
+
+        // Fill reference
+        fill(src, 3);
+        fill(w, 1);
+        fill(b, 2);
+
+        auto fc = reference::fully_connected_layer(src, w, b, TensorShape(8U, _cur_batches));
+        return reference::softmax_layer(fc, 1.f);
+    }
+
+protected:
+    TensorType      _target{};
+    SimpleTensor<T> _reference{};
+    AllocatorType   _allocator{};
+    unsigned int    _max_batches{};
+    unsigned int    _cur_batches{};
+};
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_TEST_UNIT_MEMORY_MANAGER */
diff --git a/tests/validation/fixtures/WidthConcatenateLayerFixture.h b/tests/validation/fixtures/WidthConcatenateLayerFixture.h
new file mode 100644
index 0000000..cf9b12e
--- /dev/null
+++ b/tests/validation/fixtures/WidthConcatenateLayerFixture.h
@@ -0,0 +1,160 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_TEST_WIDTHCONCATENATE_LAYER_FIXTURE
+#define ARM_COMPUTE_TEST_WIDTHCONCATENATE_LAYER_FIXTURE
+
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "tests/AssetsLibrary.h"
+#include "tests/Globals.h"
+#include "tests/IAccessor.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Fixture.h"
+#include "tests/validation/Helpers.h"
+#include "tests/validation/reference/WidthConcatenateLayer.h"
+
+#include <random>
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+template <typename TensorType, typename ITensorType, typename AccessorType, typename FunctionType, typename T>
+class WidthConcatenateLayerValidationFixture : public framework::Fixture
+{
+public:
+    template <typename...>
+    void setup(TensorShape shape, DataType data_type)
+    {
+        // Create input shapes
+        std::mt19937                    gen(library->seed());
+        std::uniform_int_distribution<> num_dis(2, 4);
+        const int                       num_tensors = num_dis(gen);
+
+        std::vector<TensorShape>         shapes(num_tensors, shape);
+        std::bernoulli_distribution      mutate_dis(0.5f);
+        std::uniform_real_distribution<> change_dis(-0.25f, 0.f);
+
+        // Generate more shapes based on the input
+        for(auto &s : shapes)
+        {
+            // Randomly change the first dimension
+            if(mutate_dis(gen))
+            {
+                // Decrease the dimension by a small percentage. Don't increase
+                // as that could make tensor too large.
+                s.set(0, s[0] + 2 * static_cast<int>(s[0] * change_dis(gen)));
+            }
+        }
+
+        _target    = compute_target(shapes, data_type);
+        _reference = compute_reference(shapes, data_type);
+    }
+
+protected:
+    template <typename U>
+    void fill(U &&tensor, int i)
+    {
+        library->fill_tensor_uniform(tensor, i);
+    }
+
+    TensorType compute_target(std::vector<TensorShape> shapes, DataType data_type)
+    {
+        std::vector<TensorType>    srcs;
+        std::vector<ITensorType *> src_ptrs;
+
+        // Create tensors
+        srcs.reserve(shapes.size());
+
+        for(const auto &shape : shapes)
+        {
+            srcs.emplace_back(create_tensor<TensorType>(shape, data_type, 1, _fractional_bits));
+            src_ptrs.emplace_back(&srcs.back());
+        }
+
+        TensorShape dst_shape = misc::shape_calculator::calculate_width_concatenate_shape(src_ptrs);
+        TensorType  dst       = create_tensor<TensorType>(dst_shape, data_type, 1, _fractional_bits);
+
+        // Create and configure function
+        FunctionType width_concat;
+        width_concat.configure(src_ptrs, &dst);
+
+        for(auto &src : srcs)
+        {
+            ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
+        }
+
+        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+        // Allocate tensors
+        for(auto &src : srcs)
+        {
+            src.allocator()->allocate();
+            ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
+        }
+
+        dst.allocator()->allocate();
+        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+        // Fill tensors
+        int i = 0;
+        for(auto &src : srcs)
+        {
+            fill(AccessorType(src), i++);
+        }
+
+        // Compute function
+        width_concat.run();
+
+        return dst;
+    }
+
+    SimpleTensor<T> compute_reference(std::vector<TensorShape> shapes, DataType data_type)
+    {
+        std::vector<SimpleTensor<T>> srcs;
+
+        // Create and fill tensors
+        int i = 0;
+        for(const auto &shape : shapes)
+        {
+            srcs.emplace_back(shape, data_type, 1, _fractional_bits);
+            fill(srcs.back(), i++);
+        }
+
+        return reference::widthconcatenate_layer<T>(srcs);
+    }
+
+    TensorType      _target{};
+    SimpleTensor<T> _reference{};
+
+private:
+    int _fractional_bits{ 1 };
+};
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_TEST_WIDTHCONCATENATE_LAYER_FIXTURE */
diff --git a/tests/validation/fixtures/WinogradConvolutionLayerFixture.h b/tests/validation/fixtures/WinogradConvolutionLayerFixture.h
new file mode 100644
index 0000000..ef596e0
--- /dev/null
+++ b/tests/validation/fixtures/WinogradConvolutionLayerFixture.h
@@ -0,0 +1,517 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_TEST_WINOGRAD_LAYER_FIXTURE
+#define ARM_COMPUTE_TEST_WINOGRAD_LAYER_FIXTURE
+
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "tests/AssetsLibrary.h"
+#include "tests/Globals.h"
+#include "tests/IAccessor.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Fixture.h"
+#include "tests/validation/Helpers.h"
+#include "tests/validation/reference/ActivationLayer.h"
+#include "tests/validation/reference/ConvolutionLayer.h"
+#include "tests/validation/reference/GEMM.h"
+#include "tests/validation/reference/Utils.h"
+#include "tests/validation/reference/Winograd.h"
+
+#include <random>
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+using namespace arm_compute::misc::shape_calculator;
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T, bool use_bias = true>
+class WinogradConvolutionLayerValidationFixture : public framework::Fixture
+{
+public:
+    template <typename...>
+    void setup(TensorShape input_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape output_shape, PadStrideInfo info, Size2D dilation, DataType data_type, ActivationLayerInfo act_info)
+    {
+        ARM_COMPUTE_UNUSED(dilation);
+
+        _target    = compute_target(input_shape, weights_shape, bias_shape, output_shape, info, data_type, act_info);
+        _reference = compute_reference(input_shape, weights_shape, bias_shape, output_shape, info, data_type, act_info);
+    }
+
+protected:
+    template <typename U>
+    void fill(U &&tensor, int i, float min, float max)
+    {
+        switch(tensor.data_type())
+        {
+            case DataType::F32:
+            {
+                std::uniform_real_distribution<> distribution(min, max);
+                library->fill(tensor, distribution, i);
+                break;
+            }
+            default:
+            {
+                ARM_COMPUTE_ERROR("Not supported");
+                library->fill_tensor_uniform(tensor, i);
+                break;
+            }
+        }
+    }
+
+    TensorType compute_target(const TensorShape &input_shape, const TensorShape &weights_shape, const TensorShape &bias_shape, const TensorShape &output_shape, const PadStrideInfo &info,
+                              DataType data_type, ActivationLayerInfo act_info)
+    {
+        // Create tensors
+        TensorType src     = create_tensor<TensorType>(input_shape, data_type, 1);
+        TensorType weights = create_tensor<TensorType>(weights_shape, data_type, 1);
+        TensorType bias    = create_tensor<TensorType>(bias_shape, data_type, 1);
+        TensorType dst     = create_tensor<TensorType>(output_shape, data_type, 1);
+
+        // Create and configure function
+        FunctionType conv;
+        ARM_COMPUTE_EXPECT(static_cast<bool>(conv.validate(src.info(), weights.info(), (use_bias) ? bias.info() : nullptr, dst.info(), info, act_info)), framework::LogLevel::ERRORS);
+        conv.configure(&src, &weights, (use_bias) ? &bias : nullptr, &dst, info, act_info);
+
+        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(weights.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(bias.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+        // Allocate tensors
+        src.allocator()->allocate();
+        weights.allocator()->allocate();
+        dst.allocator()->allocate();
+        bias.allocator()->allocate();
+
+        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(!weights.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(!bias.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+        // Fill tensors
+        fill(AccessorType(src), 0, -1.f, 1.f);
+        fill(AccessorType(weights), 1, -1.f, 1.f);
+        fill(AccessorType(bias), 2, -1.f, 1.f);
+
+        // Compute Winograd Convolution function
+        conv.run();
+
+        return dst;
+    }
+
+    SimpleTensor<T> compute_reference(const TensorShape &input_shape, const TensorShape &weights_shape, const TensorShape &bias_shape, const TensorShape &output_shape, const PadStrideInfo &info,
+                                      DataType data_type, ActivationLayerInfo act_info)
+    {
+        // Create reference
+        SimpleTensor<T> src{ input_shape, data_type, 1 };
+        SimpleTensor<T> weights{ weights_shape, data_type, 1 };
+        SimpleTensor<T> bias{ bias_shape, data_type, 1 };
+
+        // Fill reference
+        fill(src, 0, -1.f, 1.f);
+        fill(weights, 1, -1.f, 1.f);
+        if(use_bias)
+        {
+            fill(bias, 2, -1.f, 1.f);
+        }
+        else
+        {
+            fill(bias, 2, 0.f, 0.f);
+        }
+
+        SimpleTensor<T> conv_out = reference::convolution_layer<T>(src, weights, bias, output_shape, info);
+
+        return (act_info.enabled()) ? reference::activation_layer<T>(conv_out, act_info) : conv_out;
+    }
+
+    TensorType      _target{};
+    SimpleTensor<T> _reference{};
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T, bool use_bias = true>
+class WinogradConvolutionLayerFastMathValidationFixture : public framework::Fixture
+{
+public:
+    template <typename...>
+    void setup(TensorShape input_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape output_shape, PadStrideInfo info, Size2D dilation, DataType data_type, ActivationLayerInfo act_info)
+    {
+        ARM_COMPUTE_UNUSED(dilation);
+
+        _target    = compute_target(input_shape, weights_shape, bias_shape, output_shape, info, data_type, act_info);
+        _reference = compute_reference(input_shape, weights_shape, bias_shape, output_shape, info, data_type, act_info);
+    }
+
+protected:
+    template <typename U>
+    void fill(U &&tensor, int i, float min, float max)
+    {
+        switch(tensor.data_type())
+        {
+            case DataType::F32:
+            {
+                std::uniform_real_distribution<> distribution(min, max);
+                library->fill(tensor, distribution, i);
+                break;
+            }
+            default:
+            {
+                ARM_COMPUTE_ERROR("Not supported");
+                library->fill_tensor_uniform(tensor, i);
+                break;
+            }
+        }
+    }
+
+    TensorType compute_target(const TensorShape &input_shape, const TensorShape &weights_shape, const TensorShape &bias_shape, const TensorShape &output_shape, const PadStrideInfo &info,
+                              DataType data_type, ActivationLayerInfo act_info)
+    {
+        // Create tensors
+        TensorType src     = create_tensor<TensorType>(input_shape, data_type, 1);
+        TensorType weights = create_tensor<TensorType>(weights_shape, data_type, 1);
+        TensorType bias    = create_tensor<TensorType>(bias_shape, data_type, 1);
+        TensorType dst     = create_tensor<TensorType>(output_shape, data_type, 1);
+
+        // Create and configure function
+        FunctionType conv;
+        ARM_COMPUTE_EXPECT(static_cast<bool>(conv.validate(src.info(), weights.info(), (use_bias) ? bias.info() : nullptr, dst.info(), info, act_info, true /* Enable fast math */)),
+                           framework::LogLevel::ERRORS);
+        conv.configure(&src, &weights, (use_bias) ? &bias : nullptr, &dst, info, act_info, true /* Enable fast math */);
+
+        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(weights.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(bias.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+        // Allocate tensors
+        src.allocator()->allocate();
+        weights.allocator()->allocate();
+        dst.allocator()->allocate();
+        bias.allocator()->allocate();
+
+        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(!weights.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(!bias.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+        // Fill tensors
+        fill(AccessorType(src), 0, -1.f, 1.f);
+        fill(AccessorType(weights), 1, -1.f, 1.f);
+        fill(AccessorType(bias), 2, -1.f, 1.f);
+
+        // Compute Winograd Convolution function
+        conv.run();
+
+        return dst;
+    }
+
+    SimpleTensor<T> compute_reference(const TensorShape &input_shape, const TensorShape &weights_shape, const TensorShape &bias_shape, const TensorShape &output_shape, const PadStrideInfo &info,
+                                      DataType data_type, ActivationLayerInfo act_info)
+    {
+        // Create reference
+        SimpleTensor<T> src{ input_shape, data_type, 1 };
+        SimpleTensor<T> weights{ weights_shape, data_type, 1 };
+        SimpleTensor<T> bias{ bias_shape, data_type, 1 };
+
+        // Fill reference
+        fill(src, 0, -1.f, 1.f);
+        fill(weights, 1, -1.f, 1.f);
+        if(use_bias)
+        {
+            fill(bias, 2, -1.f, 1.f);
+        }
+        else
+        {
+            fill(bias, 2, 0.f, 0.f);
+        }
+
+        WinogradInfo winograd_info(Size2D(4U, 4U),
+                                   Size2D(weights_shape[0], weights_shape[1]),
+                                   Size2D(input_shape[0], input_shape[1]),
+                                   info,
+                                   src.data_layout());
+
+        // Compute tensor shapes for input, filter and output transforms
+        TensorShape input_transform_shape  = compute_winograd_input_transform_shape(TensorInfo(input_shape, 1, data_type), winograd_info);
+        TensorShape filter_transform_shape = compute_winograd_filter_transform_shape(TensorInfo(weights_shape, 1, data_type), winograd_info);
+        TensorShape batched_gemm_shape     = input_transform_shape;
+        batched_gemm_shape[0]              = filter_transform_shape[0];
+        TensorShape output_transform_shape = compute_winograd_output_transform_shape(TensorInfo(batched_gemm_shape, 1, data_type), winograd_info);
+
+        // Dummy matrix C to perform matrix multiplication
+        SimpleTensor<T> dummy_c{ batched_gemm_shape, data_type, 1 };
+
+        // Compute Winograd-based convolution
+        SimpleTensor<T> input_transform_out  = reference::winograd_input_transform<T>(src, input_transform_shape, winograd_info);
+        SimpleTensor<T> filter_transform_out = reference::winograd_filter_transform<T>(weights, filter_transform_shape, winograd_info);
+        SimpleTensor<T> batched_gemm         = reference::gemm<T>(input_transform_out, filter_transform_out, dummy_c, 1.0f, 0.0f);
+        SimpleTensor<T> conv_out             = reference::winograd_output_transform<T>(batched_gemm, bias, output_transform_shape, winograd_info);
+
+        return (act_info.enabled()) ? reference::activation_layer<T>(conv_out, act_info) : conv_out;
+    }
+
+    TensorType      _target{};
+    SimpleTensor<T> _reference{};
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class WinogradInputTransformValidationFixture : public framework::Fixture
+{
+public:
+    template <typename...>
+    void setup(TensorShape input_shape, WinogradInfo winograd_info, DataLayout data_layout, DataType data_type)
+    {
+        TensorShape output_shape = compute_winograd_input_transform_shape(TensorInfo(input_shape, 1, data_type), winograd_info);
+
+        _target    = compute_target(input_shape, output_shape, winograd_info, data_layout, data_type);
+        _reference = compute_reference(input_shape, output_shape, winograd_info, data_layout, data_type);
+    }
+
+protected:
+    template <typename U>
+    void fill(U &&tensor, int i, float min, float max)
+    {
+        switch(tensor.data_type())
+        {
+            case DataType::F32:
+            {
+                std::uniform_real_distribution<> distribution(min, max);
+                library->fill(tensor, distribution, i);
+                break;
+            }
+            default:
+            {
+                ARM_COMPUTE_ERROR("Not supported");
+                library->fill_tensor_uniform(tensor, i);
+                break;
+            }
+        }
+    }
+
+    TensorType compute_target(const TensorShape &input_shape, const TensorShape &output_shape, const WinogradInfo &winograd_info, DataLayout data_layout, DataType data_type)
+    {
+        TensorType src = create_tensor<TensorType>(input_shape, data_type, 1, 0, QuantizationInfo(), data_layout);
+        TensorType dst = create_tensor<TensorType>(output_shape, data_type, 1, 0, QuantizationInfo(), data_layout);
+
+        // Create and configure function
+        FunctionType transf;
+        transf.configure(&src, &dst, winograd_info);
+
+        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+        // Allocate tensors
+        src.allocator()->allocate();
+        dst.allocator()->allocate();
+
+        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+        // Fill tensors
+        fill(AccessorType(src), 0, -1.f, 1.f);
+
+        // Compute Winograd input transform function
+        transf.run();
+
+        return dst;
+    }
+
+    SimpleTensor<T> compute_reference(const TensorShape &input_shape, const TensorShape &output_shape, const WinogradInfo &winograd_info, DataLayout data_layout, DataType data_type)
+    {
+        // Create reference
+        SimpleTensor<T> src{ input_shape, data_type, 1, 0, QuantizationInfo(), data_layout };
+
+        // Fill reference
+        fill(src, 0, -1.f, 1.f);
+
+        return reference::winograd_input_transform<T>(src, output_shape, winograd_info);
+    }
+
+    TensorType      _target{};
+    SimpleTensor<T> _reference{};
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class WinogradFilterTransformValidationFixture : public framework::Fixture
+{
+public:
+    template <typename...>
+    void setup(TensorShape input_shape, Size2D output_tile, DataLayout data_layout, DataType data_type)
+    {
+        WinogradInfo winograd_info(output_tile, Size2D(input_shape[0], input_shape[1]), Size2D() /* Not needed */, PadStrideInfo() /* Not needed */, DataLayout::NCHW /* Not needed */);
+        TensorShape  output_shape = compute_winograd_filter_transform_shape(TensorInfo(input_shape, 1, data_type), winograd_info);
+
+        _target    = compute_target(input_shape, output_shape, winograd_info, data_layout, data_type);
+        _reference = compute_reference(input_shape, output_shape, winograd_info, data_layout, data_type);
+    }
+
+protected:
+    template <typename U>
+    void fill(U &&tensor, int i, float min, float max)
+    {
+        switch(tensor.data_type())
+        {
+            case DataType::F32:
+            {
+                std::uniform_real_distribution<> distribution(min, max);
+                library->fill(tensor, distribution, i);
+                break;
+            }
+            default:
+            {
+                ARM_COMPUTE_ERROR("Not supported");
+                library->fill_tensor_uniform(tensor, i);
+                break;
+            }
+        }
+    }
+
+    TensorType compute_target(const TensorShape &input_shape, const TensorShape &output_shape, const WinogradInfo &winograd_info, DataLayout data_layout, DataType data_type)
+    {
+        // Create tensors
+        TensorType src = create_tensor<TensorType>(input_shape, data_type, 1, 0, QuantizationInfo(), data_layout);
+        TensorType dst = create_tensor<TensorType>(output_shape, data_type, 1, 0, QuantizationInfo(), data_layout);
+
+        // Create and configure function
+        FunctionType filter_transform;
+        filter_transform.configure(&src, &dst, winograd_info);
+
+        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+        // Allocate tensors
+        src.allocator()->allocate();
+        dst.allocator()->allocate();
+
+        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+        // Fill tensors
+        fill(AccessorType(src), 0, -1.f, 1.f);
+
+        filter_transform.run();
+
+        return dst;
+    }
+
+    SimpleTensor<T> compute_reference(const TensorShape &input_shape, const TensorShape &output_shape, const WinogradInfo &winograd_info, DataLayout data_layout, DataType data_type)
+    {
+        // Create reference
+        SimpleTensor<T> src{ input_shape, data_type, 1, 0, QuantizationInfo(), data_layout };
+
+        // Fill reference
+        fill(src, 0, -1.f, 1.f);
+
+        return reference::winograd_filter_transform<T>(src, output_shape, winograd_info);
+    }
+
+    TensorType      _target{};
+    SimpleTensor<T> _reference{};
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class WinogradOutputTransformValidationFixture : public framework::Fixture
+{
+public:
+    template <typename...>
+    void setup(TensorShape input_shape, WinogradInfo winograd_info, DataType data_type)
+    {
+        TensorShape output_shape = compute_winograd_output_transform_shape(TensorInfo(input_shape, 1, data_type), winograd_info);
+
+        _target    = compute_target(input_shape, output_shape, winograd_info, data_type);
+        _reference = compute_reference(input_shape, output_shape, winograd_info, data_type);
+    }
+
+protected:
+    template <typename U>
+    void fill(U &&tensor, int i, float min, float max)
+    {
+        switch(tensor.data_type())
+        {
+            case DataType::F32:
+            {
+                std::uniform_real_distribution<> distribution(min, max);
+                library->fill(tensor, distribution, i);
+                break;
+            }
+            default:
+            {
+                ARM_COMPUTE_ERROR("Not supported");
+                library->fill_tensor_uniform(tensor, i);
+                break;
+            }
+        }
+    }
+
+    TensorType compute_target(const TensorShape &input_shape, const TensorShape &output_shape, const WinogradInfo &winograd_info, DataType data_type)
+    {
+        // Create tensors
+        TensorType src = create_tensor<TensorType>(input_shape, data_type);
+        TensorType dst = create_tensor<TensorType>(output_shape, data_type, 1, 0, QuantizationInfo(), winograd_info.output_data_layout);
+
+        // Create and configure function
+        FunctionType output_transform;
+        output_transform.configure(&src, nullptr, &dst, winograd_info);
+
+        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+        // Allocate tensors
+        src.allocator()->allocate();
+        dst.allocator()->allocate();
+
+        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+        // Fill tensors
+        fill(AccessorType(src), 0, -1.f, 1.f);
+
+        output_transform.run();
+
+        return dst;
+    }
+
+    SimpleTensor<T> compute_reference(const TensorShape &input_shape, const TensorShape &output_shape, const WinogradInfo &winograd_info, DataType data_type)
+    {
+        // Create reference
+        SimpleTensor<T> src{ input_shape, data_type };
+        SimpleTensor<T> bias{ TensorShape(input_shape[0]), data_type };
+
+        // Fill reference
+        fill(src, 0, -1.f, 1.f);
+        fill(bias, 1, 0.0f, 0.0f); // Fill with zeros as we validate just the output transform without bias contribution
+
+        return reference::winograd_output_transform<T>(src, bias, output_shape, winograd_info);
+    }
+
+    TensorType      _target{};
+    SimpleTensor<T> _reference{};
+};
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_TEST_WINOGRAD_LAYER_FIXTURE */
diff --git a/tests/validation/reference/BatchNormalizationLayer.cpp b/tests/validation/reference/BatchNormalizationLayer.cpp
index a9d9f03..c8badac 100644
--- a/tests/validation/reference/BatchNormalizationLayer.cpp
+++ b/tests/validation/reference/BatchNormalizationLayer.cpp
@@ -106,7 +106,6 @@
                     const float numerator   = src[pos] - mean[i];
                     const float x_bar       = numerator / denominator;
                     result[pos]             = beta[i] + x_bar * gamma[i];
-                    ;
                 }
             }
         }
diff --git a/tests/validation/reference/ChannelCombine.cpp b/tests/validation/reference/ChannelCombine.cpp
new file mode 100644
index 0000000..c1ec3ec
--- /dev/null
+++ b/tests/validation/reference/ChannelCombine.cpp
@@ -0,0 +1,201 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "ChannelCombine.h"
+
+#include "arm_compute/core/Types.h"
+#include "tests/validation/FixedPoint.h"
+#include "tests/validation/Helpers.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+namespace
+{
+template <typename T>
+inline std::vector<SimpleTensor<T>> create_image_planes(const TensorShape &shape, Format format)
+{
+    TensorShape image_shape = adjust_odd_shape(shape, format);
+
+    std::vector<SimpleTensor<T>> image_planes;
+
+    switch(format)
+    {
+        case Format::RGB888:
+        case Format::RGBA8888:
+        case Format::YUYV422:
+        case Format::UYVY422:
+        {
+            image_planes.emplace_back(image_shape, format);
+            break;
+        }
+        case Format::NV12:
+        case Format::NV21:
+        {
+            TensorShape shape_uv88 = calculate_subsampled_shape(image_shape, Format::UV88);
+
+            image_planes.emplace_back(image_shape, Format::U8);
+            image_planes.emplace_back(shape_uv88, Format::UV88);
+            break;
+        }
+        case Format::IYUV:
+        {
+            TensorShape shape_sub2 = calculate_subsampled_shape(image_shape, Format::IYUV);
+
+            image_planes.emplace_back(image_shape, Format::U8);
+            image_planes.emplace_back(shape_sub2, Format::U8);
+            image_planes.emplace_back(shape_sub2, Format::U8);
+            break;
+        }
+        case Format::YUV444:
+        {
+            image_planes.emplace_back(image_shape, Format::U8);
+            image_planes.emplace_back(image_shape, Format::U8);
+            image_planes.emplace_back(image_shape, Format::U8);
+            break;
+        }
+        default:
+            ARM_COMPUTE_ERROR("Not supported");
+            break;
+    }
+
+    return image_planes;
+}
+} // namespace
+
+template <typename T>
+std::vector<SimpleTensor<T>> channel_combine(const TensorShape &shape, const std::vector<SimpleTensor<T>> &image_planes, Format format)
+{
+    std::vector<SimpleTensor<T>> dst = create_image_planes<T>(shape, format);
+
+    for(unsigned int plane_idx = 0; plane_idx < dst.size(); ++plane_idx)
+    {
+        SimpleTensor<T> &dst_tensor = dst[plane_idx];
+
+        for(int element_idx = 0; element_idx < dst_tensor.num_elements(); ++element_idx)
+        {
+            Coordinates coord = index2coord(dst_tensor.shape(), element_idx);
+
+            switch(format)
+            {
+                case Format::RGB888:
+                case Format::RGBA8888:
+                {
+                    // Copy R/G/B or A channel
+                    for(int channel_idx = 0; channel_idx < dst_tensor.num_channels(); ++channel_idx)
+                    {
+                        const T &src_value = reinterpret_cast<const T *>(image_planes[channel_idx](coord))[0];
+                        T       &dst_value = reinterpret_cast<T *>(dst_tensor(coord))[channel_idx];
+
+                        dst_value = src_value;
+                    }
+                    break;
+                }
+                case Format::YUYV422:
+                case Format::UYVY422:
+                {
+                    // Find coordinates of the sub-sampled pixel
+                    const Coordinates coord_hori(coord.x() / 2, coord.y());
+
+                    const T &src0 = reinterpret_cast<const T *>(image_planes[0](coord))[0];
+                    const T &src1 = reinterpret_cast<const T *>(image_planes[1](coord_hori))[0];
+
+                    const int shift = (Format::YUYV422 == format) ? 1 : 0;
+                    T        &dst0  = reinterpret_cast<T *>(dst_tensor(coord))[1 - shift];
+                    T        &dst1  = reinterpret_cast<T *>(dst_tensor(coord))[0 + shift];
+
+                    dst0 = src0;
+                    dst1 = src1;
+
+                    Coordinates coord2 = index2coord(dst_tensor.shape(), ++element_idx);
+
+                    const T &src2 = reinterpret_cast<const T *>(image_planes[0](coord2))[0];
+                    const T &src3 = reinterpret_cast<const T *>(image_planes[2](coord_hori))[0];
+
+                    T &dst2 = reinterpret_cast<T *>(dst_tensor(coord2))[1 - shift];
+                    T &dst3 = reinterpret_cast<T *>(dst_tensor(coord2))[0 + shift];
+
+                    dst2 = src2;
+                    dst3 = src3;
+
+                    break;
+                }
+                case Format::NV12:
+                case Format::NV21:
+                {
+                    if(0U == plane_idx)
+                    {
+                        // Get and combine Y channel from plane0 of destination multi-image
+                        dst_tensor[element_idx] = image_planes[0][element_idx];
+                    }
+                    else
+                    {
+                        const int shift = (Format::NV12 == format) ? 0 : 1;
+
+                        // Get U channel from plane1 and V channel from plane2 of the source
+                        const T &src_u0 = reinterpret_cast<const T *>(image_planes[1](coord))[0];
+                        const T &src_v0 = reinterpret_cast<const T *>(image_planes[2](coord))[0];
+
+                        // Get U and V channel from plane1 of destination multi-image
+                        T &dst_u0 = reinterpret_cast<T *>(dst_tensor(coord))[0 + shift];
+                        T &dst_v0 = reinterpret_cast<T *>(dst_tensor(coord))[1 - shift];
+
+                        // Combine channel U and V
+                        dst_u0 = src_u0;
+                        dst_v0 = src_v0;
+                    }
+
+                    break;
+                }
+                case Format::IYUV:
+                case Format::YUV444:
+                {
+                    // Get Y/U/V element
+                    const T &src = reinterpret_cast<const T *>(image_planes[plane_idx](coord))[0];
+                    T       &dst = reinterpret_cast<T *>(dst_tensor(coord))[0];
+
+                    // Copy Y/U/V plane
+                    dst = src;
+
+                    break;
+                }
+                default:
+                    ARM_COMPUTE_ERROR("Not supported");
+                    break;
+            }
+        }
+    }
+
+    return dst;
+}
+
+template std::vector<SimpleTensor<uint8_t>> channel_combine(const TensorShape &shape, const std::vector<SimpleTensor<uint8_t>> &image_planes, Format format);
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/reference/ChannelCombine.h b/tests/validation/reference/ChannelCombine.h
new file mode 100644
index 0000000..cc6607d
--- /dev/null
+++ b/tests/validation/reference/ChannelCombine.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_CHANNEL_COMBINE_H__
+#define __ARM_COMPUTE_TEST_CHANNEL_COMBINE_H__
+
+#include "tests/SimpleTensor.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+template <typename T>
+std::vector<SimpleTensor<T>> channel_combine(const TensorShape &shape, const std::vector<SimpleTensor<T>> &image_planes, Format format);
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_TEST_CHANNEL_COMBINE_H__ */
diff --git a/tests/validation/reference/ChannelShuffle.cpp b/tests/validation/reference/ChannelShuffle.cpp
new file mode 100644
index 0000000..c4d8d50
--- /dev/null
+++ b/tests/validation/reference/ChannelShuffle.cpp
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "ChannelShuffle.h"
+
+#include "arm_compute/core/Types.h"
+#include "tests/validation/Helpers.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+// Reference implementation for channel shuffle taken from https://github.com/pytorch/pytorch/blob/master/caffe2/operators/channel_shuffle_op.h
+template <typename T>
+SimpleTensor<T> channel_shuffle(const SimpleTensor<T> &src, int num_groups)
+{
+    // Create reference
+    SimpleTensor<T> dst{ src.shape(), src.data_type(), src.num_channels(), src.fixed_point_position(), src.quantization_info() };
+
+    const int M                 = src.shape()[0];
+    const int N                 = src.shape()[1];
+    const int num_channels      = src.shape()[2];
+    const int batches           = src.shape()[3];
+    const int MxN               = M * N;
+    const int channels_in_group = num_channels / num_groups;
+
+    const T *src_ref = src.data();
+    T       *dst_ref = dst.data();
+
+    for(int n = 0; n < batches; ++n)
+    {
+        for(int g = 0; g < num_groups; ++g)
+        {
+            // Gather the group g block (of size channels_in_group * MxN) from output channels
+            // g + 0 * G, g + 1 * G, g + 2 * G, ..., g + (K - 1) * G etc.
+            const T *src_ptr = src_ref + g * channels_in_group * MxN + n * num_channels * MxN;
+            T       *dst_ptr = dst_ref + g * MxN + n * num_channels * MxN;
+            for(int i = 0; i < channels_in_group; ++i)
+            {
+                std::copy(src_ptr + i * MxN,
+                          src_ptr + (i + 1) * MxN,
+                          dst_ptr + i * num_groups * MxN);
+            }
+        }
+    }
+
+    return dst;
+}
+
+template SimpleTensor<uint8_t> channel_shuffle(const SimpleTensor<uint8_t> &src, int num_groups);
+template SimpleTensor<uint16_t> channel_shuffle(const SimpleTensor<uint16_t> &src, int num_groups);
+template SimpleTensor<uint32_t> channel_shuffle(const SimpleTensor<uint32_t> &src, int num_groups);
+template SimpleTensor<half> channel_shuffle(const SimpleTensor<half> &src, int num_groups);
+template SimpleTensor<float> channel_shuffle(const SimpleTensor<float> &src, int num_groups);
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/reference/ChannelShuffle.h b/tests/validation/reference/ChannelShuffle.h
new file mode 100644
index 0000000..52df19e
--- /dev/null
+++ b/tests/validation/reference/ChannelShuffle.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_CHANNEL_SHUFFLE_H__
+#define __ARM_COMPUTE_TEST_CHANNEL_SHUFFLE_H__
+
+#include "tests/SimpleTensor.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+template <typename T>
+SimpleTensor<T> channel_shuffle(const SimpleTensor<T> &src, int num_groups);
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_TEST_CHANNEL_SHUFFLE_H__ */
diff --git a/tests/validation/reference/ConvertFullyConnectedWeights.cpp b/tests/validation/reference/ConvertFullyConnectedWeights.cpp
new file mode 100644
index 0000000..b0f537f
--- /dev/null
+++ b/tests/validation/reference/ConvertFullyConnectedWeights.cpp
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "ConvertFullyConnectedWeights.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+template <typename T>
+SimpleTensor<T> convert_fully_connected_weights(const SimpleTensor<T> &src, const TensorShape &original_input_shape, const DataLayout training_data_layout)
+{
+    SimpleTensor<T> dst(src.shape(), src.data_type());
+
+    const bool         is_nchw_to_nhwc           = training_data_layout == DataLayout::NCHW;
+    const unsigned int num_elems_per_input_plane = original_input_shape.x() * original_input_shape.y();
+    const unsigned int num_channels              = original_input_shape.z();
+    const unsigned int factor_1                  = is_nchw_to_nhwc ? num_elems_per_input_plane : num_channels;
+    const unsigned int factor_2                  = is_nchw_to_nhwc ? num_channels : num_elems_per_input_plane;
+
+    for(int i = 0; i < src.num_elements(); ++i)
+    {
+        const Coordinates coords_in = index2coords(src.shape(), i);
+        const Coordinates coords_out(coords_in.x(), coords_in.y() % factor_1 * factor_2 + coords_in.y() / factor_1);
+
+        dst[coords2index(dst.shape(), coords_out)] = src[i];
+    }
+
+    return dst;
+}
+
+template SimpleTensor<uint8_t> convert_fully_connected_weights(const SimpleTensor<uint8_t> &src, const TensorShape &original_input_shape,
+                                                               const DataLayout training_data_layout);
+template SimpleTensor<half> convert_fully_connected_weights(const SimpleTensor<half> &src, const TensorShape &original_input_shape,
+                                                            const DataLayout training_data_layout);
+template SimpleTensor<float> convert_fully_connected_weights(const SimpleTensor<float> &src, const TensorShape &original_input_shape,
+                                                             const DataLayout training_data_layout);
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/reference/ConvertFullyConnectedWeights.h b/tests/validation/reference/ConvertFullyConnectedWeights.h
new file mode 100644
index 0000000..a9bbf13
--- /dev/null
+++ b/tests/validation/reference/ConvertFullyConnectedWeights.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_CONVERT_FULLY_CONNECTED_WEIGHTS_H__
+#define __ARM_COMPUTE_TEST_CONVERT_FULLY_CONNECTED_WEIGHTS_H__
+
+#include "tests/SimpleTensor.h"
+#include "tests/validation/Helpers.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+template <typename T>
+SimpleTensor<T> convert_fully_connected_weights(const SimpleTensor<T> &src, const TensorShape &original_input_shape, const DataLayout training_data_layout);
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_TEST_CONVERT_FULLY_CONNECTED_WEIGHTS_H__ */
diff --git a/tests/validation/reference/Convolution3d.h b/tests/validation/reference/Convolution3d.h
new file mode 100644
index 0000000..7001758
--- /dev/null
+++ b/tests/validation/reference/Convolution3d.h
@@ -0,0 +1,223 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_VALIDATION_CONVOLUTION_H__
+#define __ARM_COMPUTE_TEST_VALIDATION_CONVOLUTION_H__
+
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "tests/validation/FixedPoint.h"
+#include "tests/validation/Helpers.h"
+#include "tests/validation/reference/UtilsQuantizedAsymm.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace convolution_3d
+{
+namespace detail
+{
+inline bool is_valid_pixel(int i, int min, int max)
+{
+    return (i >= min && i < max);
+}
+
+// 3D convolution for floating point type
+template < typename T, typename TB, typename std::enable_if < validation::is_floating_point<T>::value &&validation::is_floating_point<TB>::value, int >::type = 0 >
+inline void convolution3d(const SimpleTensor<T> &in, const SimpleTensor<T> &weights, const SimpleTensor<TB> &bias, SimpleTensor<T> &out,
+                          int i_offset, int w_offset, int b_offset, int o_offset,
+                          int xi, int yi, int width_in, int height_in, int depth_in, int width_weights, int height_weights, int dilation_x = 1, int dilation_y = 1)
+{
+    const T *in_ptr  = in.data() + i_offset;
+    const T *w_ptr   = weights.data() + w_offset;
+    const TB *b_ptr   = bias.data() + b_offset;
+    T        *out_ptr = out.data() + o_offset;
+
+    const int half_width_weights_start  = width_weights / 2;
+    const int half_width_weights_end    = ((width_weights % 2) == 0) ? (half_width_weights_start - 1) : half_width_weights_start;
+    const int half_height_weights_start = height_weights / 2;
+    const int half_height_weights_end   = ((height_weights % 2) == 0) ? (half_height_weights_start - 1) : half_height_weights_start;
+
+    // Reset accumulator
+    T acc(0);
+
+    // Compute a 2D convolution for each IFM and accumulate the result
+    for(int ifm = 0; ifm < depth_in; ++ifm)
+    {
+        // Compute the offset for the input slice
+        const int offset_slice_in = xi + yi * width_in + ifm * width_in * height_in;
+
+        // Compute 2D convolution
+        for(int yk = -half_height_weights_start; yk <= half_height_weights_end; ++yk)
+        {
+            for(int xk = -half_width_weights_start; xk <= half_width_weights_end; ++xk)
+            {
+                // Check if the pixel is out-of-bound
+                if(is_valid_pixel(xi + xk * dilation_x, 0, width_in) && is_valid_pixel(yi + yk * dilation_y, 0, height_in))
+                {
+                    const int idx = xk + half_width_weights_start;
+                    const int idy = yk + half_height_weights_start;
+
+                    const T i_value = in_ptr[offset_slice_in + xk * dilation_x + yk * dilation_y * width_in];
+                    const T w_value = w_ptr[idx + idy * width_weights + ifm * width_weights * height_weights];
+
+                    acc += i_value * w_value;
+                }
+            }
+        }
+    }
+
+    // Accumulate the bias and store the result
+    *out_ptr = acc + (*b_ptr);
+}
+
+// 3D convolution for fixed point type
+template < typename T, typename TB, typename std::enable_if < std::is_integral<T>::value &&std::is_integral<TB>::value, int >::type = 0 >
+inline void convolution3d(const SimpleTensor<T> &in, const SimpleTensor<T> &weights, const SimpleTensor<TB> &bias, SimpleTensor<T> &out,
+                          int i_offset, int w_offset, int b_offset, int o_offset,
+                          int xi, int yi, int width_in, int height_in, int depth_in, int width_weights, int height_weights, int dilation_x = 1, int dilation_y = 1)
+{
+    const T *in_ptr               = in.data() + i_offset;
+    const T *w_ptr                = weights.data() + w_offset;
+    const T *b_ptr                = bias.data() + b_offset;
+    T       *out_ptr              = out.data() + o_offset;
+    int      fixed_point_position = in.fixed_point_position();
+
+    const int half_width_weights_start  = width_weights / 2;
+    const int half_width_weights_end    = ((width_weights % 2) == 0) ? (half_width_weights_start - 1) : half_width_weights_start;
+    const int half_height_weights_start = height_weights / 2;
+    const int half_height_weights_end   = ((height_weights % 2) == 0) ? (half_height_weights_start - 1) : half_height_weights_start;
+
+    using namespace fixed_point_arithmetic;
+    using promoted_type = fixed_point_arithmetic::traits::promote_t<T>;
+
+    // Reset accumulator
+    fixed_point<promoted_type> acc(0, fixed_point_position);
+
+    // Compute a 2D convolution for each IFM and accumulate the result
+    for(int ifm = 0; ifm < depth_in; ++ifm)
+    {
+        // Compute the offset for the input slice
+        const int offset_slice_in = xi + yi * width_in + ifm * width_in * height_in;
+
+        // Compute 2D convolution
+        for(int yk = -half_height_weights_start; yk <= half_height_weights_end; ++yk)
+        {
+            for(int xk = -half_width_weights_start; xk <= half_width_weights_end; ++xk)
+            {
+                // Check if the pixel is out-of-bound
+                if(is_valid_pixel(xi + xk * dilation_x, 0, width_in) && is_valid_pixel(yi + yk * dilation_y, 0, height_in))
+                {
+                    const int idx = xk + half_width_weights_start;
+                    const int idy = yk + half_height_weights_start;
+
+                    const fixed_point<promoted_type> i_value(in_ptr[offset_slice_in + xk * dilation_x + yk * dilation_y * width_in], fixed_point_position, true);
+                    const fixed_point<promoted_type> w_value(w_ptr[idx + idy * width_weights + ifm * width_weights * height_weights], fixed_point_position, true);
+                    const fixed_point<promoted_type> iw = i_value * w_value;
+                    acc                                 = iw + acc;
+                }
+            }
+        }
+    }
+
+    // Get the bias
+    const fixed_point<promoted_type> b(*b_ptr, fixed_point_position, true);
+
+    // Accumulate the bias and convert back
+    acc = acc + b;
+    fixed_point<T> res(acc);
+    *out_ptr = res.raw();
+}
+
+// 3D convolution for QASYMM8 type
+template <>
+inline void convolution3d(const SimpleTensor<uint8_t> &in, const SimpleTensor<uint8_t> &weights, const SimpleTensor<int32_t> &bias, SimpleTensor<uint8_t> &out,
+                          int i_offset, int w_offset, int b_offset, int o_offset,
+                          int xi, int yi, int width_in, int height_in, int depth_in, int width_weights, int height_weights, int dilation_x, int dilation_y)
+{
+    const uint8_t *in_ptr  = in.data() + i_offset;
+    const uint8_t *w_ptr   = weights.data() + w_offset;
+    const int32_t *b_ptr   = bias.data() + b_offset;
+    uint8_t       *out_ptr = out.data() + o_offset;
+
+    const int   input_offset   = -in.quantization_info().offset;
+    const float input_scale    = in.quantization_info().scale;
+    const int   weights_offset = -weights.quantization_info().offset;
+    const float weights_scale  = weights.quantization_info().scale;
+    const int   output_offset  = out.quantization_info().offset;
+    const float output_scale   = out.quantization_info().scale;
+
+    int         output_multiplier = 0;
+    int         output_shift      = 0;
+    const float multiplier        = input_scale * weights_scale / output_scale;
+    arm_compute::quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
+
+    const int half_width_weights_start  = width_weights / 2;
+    const int half_width_weights_end    = ((width_weights % 2) == 0) ? (half_width_weights_start - 1) : half_width_weights_start;
+    const int half_height_weights_start = height_weights / 2;
+    const int half_height_weights_end   = ((height_weights % 2) == 0) ? (half_height_weights_start - 1) : half_height_weights_start;
+
+    // Reset accumulator
+    int32_t acc(0);
+
+    // Compute a 2D convolution for each IFM and accumulate the result
+    for(int ifm = 0; ifm < depth_in; ++ifm)
+    {
+        // Compute the offset for the input slice
+        const int offset_slice_in = xi + yi * width_in + ifm * width_in * height_in;
+
+        // Compute 2D convolution
+        for(int yk = -half_height_weights_start; yk <= half_height_weights_end; ++yk)
+        {
+            for(int xk = -half_width_weights_start; xk <= half_width_weights_end; ++xk)
+            {
+                // Check if the pixel is out-of-bound
+                if(is_valid_pixel(xi + xk * dilation_x, 0, width_in) && is_valid_pixel(yi + yk * dilation_y, 0, height_in))
+                {
+                    const int idx = xk + half_width_weights_start;
+                    const int idy = yk + half_height_weights_start;
+
+                    const uint8_t i_value = in_ptr[offset_slice_in + xk * dilation_x + yk * dilation_y * width_in];
+                    const uint8_t w_value = w_ptr[idx + idy * width_weights + ifm * width_weights * height_weights];
+
+                    acc += (i_value + input_offset) * (w_value + weights_offset);
+                }
+            }
+        }
+    }
+
+    // Accumulate the bias
+    acc += (*b_ptr);
+
+    acc = validation::asymm_rounding_divide_by_pow2(validation::asymm_int_mult(acc, output_multiplier), output_shift);
+    acc += output_offset;
+    acc = utility::clamp<int32_t>(acc, 0, 255);
+
+    // Store the result
+    *out_ptr = acc;
+}
+} // namespace detail
+} // namespace convolution_3d
+} // namespace test
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_TEST_VALIDATION_CONVOLUTION_H__ */
diff --git a/tests/validation/reference/ConvolutionLayer.cpp b/tests/validation/reference/ConvolutionLayer.cpp
index b7ed2f5..fe558ba 100644
--- a/tests/validation/reference/ConvolutionLayer.cpp
+++ b/tests/validation/reference/ConvolutionLayer.cpp
@@ -25,6 +25,8 @@
 
 #include "tests/validation/FixedPoint.h"
 #include "tests/validation/Helpers.h"
+#include "tests/validation/reference/Convolution3d.h"
+#include "tests/validation/reference/Permute.h"
 #include "tests/validation/reference/Utils.h"
 #include "tests/validation/reference/UtilsQuantizedAsymm.h"
 
@@ -42,193 +44,12 @@
 {
 namespace
 {
-inline bool is_valid_pixel(int i, int min, int max)
-{
-    return (i >= min && i < max);
-}
-
-// 3D convolution for floating point type
-template < typename T, typename TB, typename std::enable_if < is_floating_point<T>::value &&is_floating_point<TB>::value, int >::type = 0 >
-void convolution3d(const SimpleTensor<T> &in, const SimpleTensor<T> &weights, const SimpleTensor<TB> &bias, SimpleTensor<T> &out,
-                   int i_offset, int w_offset, int b_offset, int o_offset,
-                   int xi, int yi, int width_in, int height_in, int depth_in, int width_weights, int height_weights)
-{
-    const T *in_ptr  = in.data() + i_offset;
-    const T *w_ptr   = weights.data() + w_offset;
-    const TB *b_ptr   = bias.data() + b_offset;
-    T        *out_ptr = out.data() + o_offset;
-
-    const int half_width_weights_start  = width_weights / 2;
-    const int half_width_weights_end    = ((width_weights % 2) == 0) ? (half_width_weights_start - 1) : half_width_weights_start;
-    const int half_height_weights_start = height_weights / 2;
-    const int half_height_weights_end   = ((height_weights % 2) == 0) ? (half_height_weights_start - 1) : half_height_weights_start;
-
-    // Reset accumulator
-    T acc(0);
-
-    // Compute a 2D convolution for each IFM and accumulate the result
-    for(int ifm = 0; ifm < depth_in; ++ifm)
-    {
-        // Compute the offset for the input slice
-        const int offset_slice_in = xi + yi * width_in + ifm * width_in * height_in;
-
-        // Compute 2D convolution
-        for(int yk = -half_height_weights_start; yk <= half_height_weights_end; ++yk)
-        {
-            for(int xk = -half_width_weights_start; xk <= half_width_weights_end; ++xk)
-            {
-                // Check if the pixel is out-of-bound
-                if(is_valid_pixel(xi + xk, 0, width_in) && is_valid_pixel(yi + yk, 0, height_in))
-                {
-                    const int idx = xk + half_width_weights_start;
-                    const int idy = yk + half_height_weights_start;
-
-                    const T i_value = in_ptr[offset_slice_in + xk + yk * width_in];
-                    const T w_value = w_ptr[idx + idy * width_weights + ifm * width_weights * height_weights];
-
-                    acc += i_value * w_value;
-                }
-            }
-        }
-    }
-
-    // Accumulate the bias and store the result
-    *out_ptr = acc + (*b_ptr);
-}
-
-// 3D convolution for fixed point type
-template < typename T, typename TB, typename std::enable_if < std::is_integral<T>::value &&std::is_integral<TB>::value, int >::type = 0 >
-void convolution3d(const SimpleTensor<T> &in, const SimpleTensor<T> &weights, const SimpleTensor<TB> &bias, SimpleTensor<T> &out,
-                   int i_offset, int w_offset, int b_offset, int o_offset,
-                   int xi, int yi, int width_in, int height_in, int depth_in, int width_weights, int height_weights)
-{
-    const T *in_ptr               = in.data() + i_offset;
-    const T *w_ptr                = weights.data() + w_offset;
-    const T *b_ptr                = bias.data() + b_offset;
-    T       *out_ptr              = out.data() + o_offset;
-    int      fixed_point_position = in.fixed_point_position();
-
-    const int half_width_weights_start  = width_weights / 2;
-    const int half_width_weights_end    = ((width_weights % 2) == 0) ? (half_width_weights_start - 1) : half_width_weights_start;
-    const int half_height_weights_start = height_weights / 2;
-    const int half_height_weights_end   = ((height_weights % 2) == 0) ? (half_height_weights_start - 1) : half_height_weights_start;
-
-    using namespace fixed_point_arithmetic;
-    using promoted_type = fixed_point_arithmetic::traits::promote_t<T>;
-
-    // Reset accumulator
-    fixed_point<promoted_type> acc(0, fixed_point_position);
-
-    // Compute a 2D convolution for each IFM and accumulate the result
-    for(int ifm = 0; ifm < depth_in; ++ifm)
-    {
-        // Compute the offset for the input slice
-        const int offset_slice_in = xi + yi * width_in + ifm * width_in * height_in;
-
-        // Compute 2D convolution
-        for(int yk = -half_height_weights_start; yk <= half_height_weights_end; ++yk)
-        {
-            for(int xk = -half_width_weights_start; xk <= half_width_weights_end; ++xk)
-            {
-                // Check if the pixel is out-of-bound
-                if(is_valid_pixel(xi + xk, 0, width_in) && is_valid_pixel(yi + yk, 0, height_in))
-                {
-                    const int idx = xk + half_width_weights_start;
-                    const int idy = yk + half_height_weights_start;
-
-                    const fixed_point<promoted_type> i_value(in_ptr[offset_slice_in + xk + yk * width_in], fixed_point_position, true);
-                    const fixed_point<promoted_type> w_value(w_ptr[idx + idy * width_weights + ifm * width_weights * height_weights], fixed_point_position, true);
-                    const fixed_point<promoted_type> iw = i_value * w_value;
-                    acc                                 = iw + acc;
-                }
-            }
-        }
-    }
-
-    // Get the bias
-    const fixed_point<promoted_type> b(*b_ptr, fixed_point_position, true);
-
-    // Accumulate the bias and covert back
-    acc = acc + b;
-    fixed_point<T> res(acc);
-    *out_ptr = res.raw();
-}
-
-// 3D convolution for QASYMM8 type
-template <>
-void convolution3d(const SimpleTensor<uint8_t> &in, const SimpleTensor<uint8_t> &weights, const SimpleTensor<int32_t> &bias, SimpleTensor<uint8_t> &out,
-                   int i_offset, int w_offset, int b_offset, int o_offset,
-                   int xi, int yi, int width_in, int height_in, int depth_in, int width_weights, int height_weights)
-{
-    const uint8_t *in_ptr  = in.data() + i_offset;
-    const uint8_t *w_ptr   = weights.data() + w_offset;
-    const int32_t *b_ptr   = bias.data() + b_offset;
-    uint8_t       *out_ptr = out.data() + o_offset;
-
-    const int   input_offset   = -in.quantization_info().offset;
-    const float input_scale    = in.quantization_info().scale;
-    const int   weights_offset = -weights.quantization_info().offset;
-    const float weights_scale  = weights.quantization_info().scale;
-    const int   output_offset  = out.quantization_info().offset;
-    const float output_scale   = out.quantization_info().scale;
-
-    int         output_multiplier = 0;
-    int         output_shift      = 0;
-    const float multiplier        = input_scale * weights_scale / output_scale;
-    arm_compute::quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
-
-    const int half_width_weights_start  = width_weights / 2;
-    const int half_width_weights_end    = ((width_weights % 2) == 0) ? (half_width_weights_start - 1) : half_width_weights_start;
-    const int half_height_weights_start = height_weights / 2;
-    const int half_height_weights_end   = ((height_weights % 2) == 0) ? (half_height_weights_start - 1) : half_height_weights_start;
-
-    // Reset accumulator
-    int32_t acc(0);
-
-    // Compute a 2D convolution for each IFM and accumulate the result
-    for(int ifm = 0; ifm < depth_in; ++ifm)
-    {
-        // Compute the offset for the input slice
-        const int offset_slice_in = xi + yi * width_in + ifm * width_in * height_in;
-
-        // Compute 2D convolution
-        for(int yk = -half_height_weights_start; yk <= half_height_weights_end; ++yk)
-        {
-            for(int xk = -half_width_weights_start; xk <= half_width_weights_end; ++xk)
-            {
-                // Check if the pixel is out-of-bound
-                if(is_valid_pixel(xi + xk, 0, width_in) && is_valid_pixel(yi + yk, 0, height_in))
-                {
-                    const int idx = xk + half_width_weights_start;
-                    const int idy = yk + half_height_weights_start;
-
-                    const uint8_t i_value = in_ptr[offset_slice_in + xk + yk * width_in];
-                    const uint8_t w_value = w_ptr[idx + idy * width_weights + ifm * width_weights * height_weights];
-
-                    acc += (i_value + input_offset) * (w_value + weights_offset);
-                }
-            }
-        }
-    }
-
-    // Accumulate the bias
-    acc += (*b_ptr);
-
-    acc = asymm_rounding_divide_by_pow2(asymm_int_mult(acc, output_multiplier), output_shift);
-    acc += output_offset;
-    acc = utility::clamp<int32_t>(acc, 0, 255);
-
-    // Store the result
-    *out_ptr = acc;
-}
 } // namespace
 
 template <typename T, typename TB>
-SimpleTensor<T> convolution_layer(const SimpleTensor<T> &src, const SimpleTensor<T> &weights, const SimpleTensor<TB> &bias, const TensorShape &output_shape, const PadStrideInfo &info)
+SimpleTensor<T> convolution_layer_nchw(const SimpleTensor<T> &src, const SimpleTensor<T> &weights, const SimpleTensor<TB> &bias, SimpleTensor<T> &dst, const PadStrideInfo &info,
+                                       const Size2D &dilation)
 {
-    // Create reference
-    SimpleTensor<T> dst{ output_shape, src.data_type(), 1, src.fixed_point_position(), src.quantization_info() };
-
     // Compute reference
     const int width_in       = src.shape().x();
     const int height_in      = src.shape().y();
@@ -244,10 +65,10 @@
     const int stride_xi      = info.stride().first;
     const int stride_yi      = info.stride().second;
 
-    auto output_wh = scaled_dimensions(width_in, height_in, width_weights, height_weights, info);
+    auto output_wh = scaled_dimensions(width_in, height_in, width_weights, height_weights, info, dilation);
 
-    const int start_xi    = width_weights / 2 - pad_left;
-    const int start_yi    = height_weights / 2 - pad_top;
+    const int start_xi    = (dilation.x() * (width_weights - 1) + 1) / 2 - pad_left;
+    const int start_yi    = (dilation.y() * (height_weights - 1) + 1) / 2 - pad_top;
     const int end_xi      = output_wh.first * stride_xi;
     const int end_yi      = output_wh.second * stride_yi;
     const int num_batches = src.shape().total_size() / (width_in * height_in * depth_in);
@@ -270,11 +91,11 @@
                     ARM_COMPUTE_ASSERT(yo < height_out);
 
                     // Compute 3D convolution
-                    convolution3d(src, weights, bias, dst,
-                                  offset_in, ofm * width_weights * height_weights * depth_weights, ofm, offset_out,
-                                  xi, yi,
-                                  width_in, height_in, depth_in,
-                                  width_weights, height_weights);
+                    convolution_3d::detail::convolution3d(src, weights, bias, dst,
+                                                          offset_in, ofm * width_weights * height_weights * depth_weights, ofm, offset_out,
+                                                          xi, yi,
+                                                          width_in, height_in, depth_in,
+                                                          width_weights, height_weights, dilation.x(), dilation.y());
                 }
             }
         }
@@ -282,18 +103,38 @@
 
     return dst;
 }
+template <typename T, typename TB>
+SimpleTensor<T> convolution_layer(const SimpleTensor<T> &src, const SimpleTensor<T> &weights, const SimpleTensor<TB> &bias, const TensorShape &output_shape, const PadStrideInfo &info,
+                                  const Size2D &dilation)
+{
+    // Create reference
+    SimpleTensor<T> dst{ output_shape, src.data_type(), 1, src.fixed_point_position(), src.quantization_info() };
+
+    if(src.data_layout() == DataLayout::NHWC)
+    {
+        SimpleTensor<T> src_nchw     = reference::permute<T>(src, PermutationVector(1U, 2U, 0U));
+        SimpleTensor<T> weights_nchw = reference::permute<T>(weights, PermutationVector(1U, 2U, 0U));
+        SimpleTensor<T> dst_nchw     = reference::permute<T>(dst, PermutationVector(1U, 2U, 0U));
+
+        return reference::permute<T>(convolution_layer_nchw(src_nchw, weights_nchw, bias, dst_nchw, info, dilation), PermutationVector(2U, 0U, 1U));
+    }
+    else
+    {
+        return convolution_layer_nchw(src, weights, bias, dst, info, dilation);
+    }
+}
 
 template SimpleTensor<float> convolution_layer(const SimpleTensor<float> &src, const SimpleTensor<float> &weights, const SimpleTensor<float> &bias, const TensorShape &output_shape,
-                                               const PadStrideInfo &info);
+                                               const PadStrideInfo &info, const Size2D &dilation);
 template SimpleTensor<half> convolution_layer(const SimpleTensor<half> &src, const SimpleTensor<half> &weights, const SimpleTensor<half> &bias, const TensorShape &output_shape,
-                                              const PadStrideInfo &info);
+                                              const PadStrideInfo &info, const Size2D &dilation);
 template SimpleTensor<qint8_t> convolution_layer(const SimpleTensor<qint8_t> &src, const SimpleTensor<qint8_t> &weights, const SimpleTensor<qint8_t> &bias, const TensorShape &output_shape,
-                                                 const PadStrideInfo &info);
+                                                 const PadStrideInfo &info, const Size2D &dilation);
 template SimpleTensor<qint16_t> convolution_layer(const SimpleTensor<qint16_t> &src, const SimpleTensor<qint16_t> &weights, const SimpleTensor<qint16_t> &bias, const TensorShape &output_shape,
-                                                  const PadStrideInfo &info);
+                                                  const PadStrideInfo &info, const Size2D &dilation);
 template SimpleTensor<uint8_t> convolution_layer(const SimpleTensor<uint8_t> &src, const SimpleTensor<uint8_t> &weights, const SimpleTensor<int32_t> &bias, const TensorShape &output_shape,
-                                                 const PadStrideInfo &info);
+                                                 const PadStrideInfo &info, const Size2D &dilation);
 } // namespace reference
 } // namespace validation
 } // namespace test
-} // namespace arm_compute
+} // namespace arm_compute
\ No newline at end of file
diff --git a/tests/validation/reference/ConvolutionLayer.h b/tests/validation/reference/ConvolutionLayer.h
index 57455ba..ff3b153 100644
--- a/tests/validation/reference/ConvolutionLayer.h
+++ b/tests/validation/reference/ConvolutionLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -36,7 +36,8 @@
 namespace reference
 {
 template <typename T, typename TB>
-SimpleTensor<T> convolution_layer(const SimpleTensor<T> &src, const SimpleTensor<T> &weights, const SimpleTensor<TB> &bias, const TensorShape &output_shape, const PadStrideInfo &info);
+SimpleTensor<T> convolution_layer(const SimpleTensor<T> &src, const SimpleTensor<T> &weights, const SimpleTensor<TB> &bias, const TensorShape &output_shape, const PadStrideInfo &info,
+                                  const Size2D &dilation = Size2D(1U, 1U));
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/Copy.cpp b/tests/validation/reference/Copy.cpp
new file mode 100644
index 0000000..dc519a4
--- /dev/null
+++ b/tests/validation/reference/Copy.cpp
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "Copy.h"
+
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+template <typename T>
+SimpleTensor<T> copy(const SimpleTensor<T> &src, const TensorShape &output_shape)
+{
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(src.shape(), output_shape);
+
+    SimpleTensor<T> dst(output_shape, src.data_type());
+    std::copy_n(src.data(), src.num_elements(), dst.data());
+    return dst;
+}
+
+template SimpleTensor<uint8_t> copy(const SimpleTensor<uint8_t> &src, const TensorShape &output_shape);
+template SimpleTensor<int8_t> copy(const SimpleTensor<int8_t> &src, const TensorShape &output_shape);
+template SimpleTensor<uint16_t> copy(const SimpleTensor<uint16_t> &src, const TensorShape &output_shape);
+template SimpleTensor<int16_t> copy(const SimpleTensor<int16_t> &src, const TensorShape &output_shape);
+template SimpleTensor<uint32_t> copy(const SimpleTensor<uint32_t> &src, const TensorShape &output_shape);
+template SimpleTensor<int32_t> copy(const SimpleTensor<int32_t> &src, const TensorShape &output_shape);
+template SimpleTensor<half> copy(const SimpleTensor<half> &src, const TensorShape &output_shape);
+template SimpleTensor<float> copy(const SimpleTensor<float> &src, const TensorShape &output_shape);
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/reference/Copy.h b/tests/validation/reference/Copy.h
new file mode 100644
index 0000000..362af03
--- /dev/null
+++ b/tests/validation/reference/Copy.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_COPY_H__
+#define __ARM_COMPUTE_TEST_COPY_H__
+
+#include "tests/SimpleTensor.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+template <typename T>
+SimpleTensor<T> copy(const SimpleTensor<T> &src, const TensorShape &output_shape);
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_TEST_COPY_H__ */
diff --git a/tests/validation/reference/DepthwiseConvolutionLayer.cpp b/tests/validation/reference/DepthwiseConvolutionLayer.cpp
index b2a7067..10c617e 100644
--- a/tests/validation/reference/DepthwiseConvolutionLayer.cpp
+++ b/tests/validation/reference/DepthwiseConvolutionLayer.cpp
@@ -50,9 +50,9 @@
  *
  */
 template <typename T, typename TB>
-SimpleTensor<T> depthwise_convolution(const SimpleTensor<T> &src, const SimpleTensor<T> &weights, const SimpleTensor<TB> &biases, const TensorShape &dst_shape, const PadStrideInfo &conv_info)
+SimpleTensor<T> depthwise_convolution(const SimpleTensor<T> &src, const SimpleTensor<T> &weights, const SimpleTensor<TB> &biases, const TensorShape &dst_shape, const PadStrideInfo &conv_info,
+                                      unsigned int depth_multiplier)
 {
-    // Create reference
     SimpleTensor<T> dst{ dst_shape, src.data_type(), 1, src.fixed_point_position() };
 
     // Compute reference
@@ -77,33 +77,39 @@
     const int maximum_x = input_width + pad_left - filter_half_width + pad_right - filter_half_width;
     const int maximum_y = input_height + pad_top - filter_half_height + pad_bottom - filter_half_height;
 
+    const T border_value(0);
+
     int out_pos = 0;
     for(int r = 0; r < num_batches; ++r)
     {
         for(int z = 0; z < input_depth; ++z)
         {
-            for(int y = minimum_y; y < minimum_y + maximum_y; y += conv_info.stride().second)
+            for(unsigned int m = 0; m < depth_multiplier; ++m)
             {
-                for(int x = minimum_x; x < minimum_x + maximum_x; x += conv_info.stride().first)
-                {
-                    Coordinates coords(static_cast<int>(x), static_cast<int>(y), static_cast<int>(z), static_cast<int>(r));
-                    size_t      filter_offset = filter_plane * z;
+                const int out_z = z * depth_multiplier + m;
 
-                    T val(0);
-                    for(int j = y - filter_half_height; j <= static_cast<int>(y + filter_half_height); ++j)
+                for(int y = minimum_y; y < minimum_y + maximum_y; y += conv_info.stride().second)
+                {
+                    for(int x = minimum_x; x < minimum_x + maximum_x; x += conv_info.stride().first)
                     {
-                        for(int i = x - filter_half_width; i <= static_cast<int>(x + filter_half_width); ++i)
+                        Coordinates coords(static_cast<int>(x), static_cast<int>(y), static_cast<int>(z), static_cast<int>(r));
+                        size_t      filter_offset = filter_plane * out_z;
+
+                        T val(0);
+                        for(int j = y - filter_half_height; j <= static_cast<int>(y + filter_half_height); ++j)
                         {
-                            coords.set(0, i);
-                            coords.set(1, j);
-                            T border_value(0);
-                            val += *(weights.data() + filter_offset) * tensor_elem_at(src, coords, BorderMode::CONSTANT, border_value);
-                            ++filter_offset;
+                            for(int i = x - filter_half_width; i <= static_cast<int>(x + filter_half_width); ++i)
+                            {
+                                coords.set(0, i);
+                                coords.set(1, j);
+
+                                val += *(weights.data() + filter_offset) * tensor_elem_at(src, coords, BorderMode::CONSTANT, border_value);
+                                ++filter_offset;
+                            }
                         }
+
+                        dst[out_pos++] = saturate_cast<T>(val + *static_cast<const TB *>(biases(Coordinates(out_z))));
                     }
-                    coords.set(0, x);
-                    coords.set(1, y);
-                    dst[out_pos++] = saturate_cast<T>(val + *static_cast<const TB *>(biases(Coordinates(z))));
                 }
             }
         }
@@ -114,11 +120,11 @@
 
 template <>
 SimpleTensor<uint8_t> depthwise_convolution(const SimpleTensor<uint8_t> &src, const SimpleTensor<uint8_t> &weights, const SimpleTensor<int32_t> &biases, const TensorShape &dst_shape,
-                                            const PadStrideInfo &conv_info)
+                                            const PadStrideInfo &conv_info, unsigned int depth_multiplier)
 {
-    // Create reference
     SimpleTensor<uint8_t> dst{ dst_shape, src.data_type(), 1, src.fixed_point_position(), src.quantization_info() };
 
+    // Create reference
     const int   input_offset   = -src.quantization_info().offset;
     const float input_scale    = src.quantization_info().scale;
     const int   weights_offset = -weights.quantization_info().offset;
@@ -158,35 +164,40 @@
     {
         for(int z = 0; z < input_depth; ++z)
         {
-            int32_t bias_val = *static_cast<const int32_t *>(biases(Coordinates(z)));
-            for(int y = minimum_y; y < minimum_y + maximum_y; y += conv_info.stride().second)
+            for(unsigned int m = 0; m < depth_multiplier; ++m)
             {
-                for(int x = minimum_x; x < minimum_x + maximum_x; x += conv_info.stride().first)
+                const int     out_z    = z * depth_multiplier + m;
+                const int32_t bias_val = *static_cast<const int32_t *>(biases(Coordinates(out_z)));
+
+                for(int y = minimum_y; y < minimum_y + maximum_y; y += conv_info.stride().second)
                 {
-                    Coordinates coords(x, y, z, r);
-                    int         filter_offset = filter_plane * z;
-
-                    int32_t val = 0;
-                    for(int j = y - filter_half_height; j <= (y + filter_half_height); ++j)
+                    for(int x = minimum_x; x < minimum_x + maximum_x; x += conv_info.stride().first)
                     {
-                        for(int i = x - filter_half_width; i <= (x + filter_half_width); ++i)
-                        {
-                            coords.set(0, i);
-                            coords.set(1, j);
-                            auto    in_val = tensor_elem_at<uint8_t>(src, coords, BorderMode::CONSTANT, -input_offset);
-                            uint8_t w_val  = *(weights.data() + filter_offset);
-                            val += (in_val + input_offset) * (w_val + weights_offset);
-                            ++filter_offset;
-                        }
-                    }
-                    val += bias_val;
-                    val = asymm_rounding_divide_by_pow2(asymm_int_mult(val, output_multiplier), output_shift);
-                    val += output_offset;
-                    val = std::max<int32_t>(val, 0);
-                    val = std::min<int32_t>(val, 255);
+                        Coordinates coords(x, y, z, r);
+                        int         filter_offset = filter_plane * out_z;
 
-                    // Store the result
-                    dst[out_pos++] = val;
+                        int32_t val = 0;
+                        for(int j = y - filter_half_height; j <= (y + filter_half_height); ++j)
+                        {
+                            for(int i = x - filter_half_width; i <= (x + filter_half_width); ++i)
+                            {
+                                coords.set(0, i);
+                                coords.set(1, j);
+                                const auto    in_val = tensor_elem_at<uint8_t>(src, coords, BorderMode::CONSTANT, -input_offset);
+                                const uint8_t w_val  = *(weights.data() + filter_offset);
+                                val += (in_val + input_offset) * (w_val + weights_offset);
+                                ++filter_offset;
+                            }
+                        }
+                        val += bias_val;
+                        val = asymm_rounding_divide_by_pow2(asymm_int_mult(val, output_multiplier), output_shift);
+                        val += output_offset;
+                        val = std::max<int32_t>(val, 0);
+                        val = std::min<int32_t>(val, 255);
+
+                        // Store the result
+                        dst[out_pos++] = val;
+                    }
                 }
             }
         }
@@ -196,10 +207,10 @@
 }
 
 template SimpleTensor<float> depthwise_convolution(const SimpleTensor<float> &src, const SimpleTensor<float> &weights, const SimpleTensor<float> &biases, const TensorShape &dst_shape,
-                                                   const PadStrideInfo &conv_info);
+                                                   const PadStrideInfo &conv_info, unsigned int depth_multiplier);
 
 template SimpleTensor<half> depthwise_convolution(const SimpleTensor<half> &src, const SimpleTensor<half> &weights, const SimpleTensor<half> &biases, const TensorShape &dst_shape,
-                                                  const PadStrideInfo &conv_info);
+                                                  const PadStrideInfo &conv_info, unsigned int depth_multiplier);
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/DepthwiseConvolutionLayer.h b/tests/validation/reference/DepthwiseConvolutionLayer.h
index df743a5..bab3387 100644
--- a/tests/validation/reference/DepthwiseConvolutionLayer.h
+++ b/tests/validation/reference/DepthwiseConvolutionLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -36,7 +36,8 @@
 namespace reference
 {
 template <typename T, typename TB>
-SimpleTensor<T> depthwise_convolution(const SimpleTensor<T> &src, const SimpleTensor<T> &weights, const SimpleTensor<TB> &biases, const TensorShape &dst_shape, const PadStrideInfo &conv_info);
+SimpleTensor<T> depthwise_convolution(const SimpleTensor<T> &src, const SimpleTensor<T> &weights, const SimpleTensor<TB> &biases, const TensorShape &dst_shape, const PadStrideInfo &conv_info,
+                                      unsigned int depth_multiplier);
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/DepthwiseSeparableConvolutionLayer.cpp b/tests/validation/reference/DepthwiseSeparableConvolutionLayer.cpp
index ca6c168..8bc6ddb 100644
--- a/tests/validation/reference/DepthwiseSeparableConvolutionLayer.cpp
+++ b/tests/validation/reference/DepthwiseSeparableConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -46,7 +46,7 @@
                                                       const SimpleTensor<T> &pointwise_biases, const TensorShape &dst_shape, const PadStrideInfo &depthwise_conv_info, const PadStrideInfo &pointwise_conv_info)
 {
     // Compute reference
-    SimpleTensor<T> depthwise_out = depthwise_convolution(src, depthwise_weights, depthwise_biases, depthwise_out_shape, depthwise_conv_info);
+    SimpleTensor<T> depthwise_out = depthwise_convolution(src, depthwise_weights, depthwise_biases, depthwise_out_shape, depthwise_conv_info, 1);
     SimpleTensor<T> dst           = convolution_layer(depthwise_out, pointwise_weights, pointwise_biases, dst_shape, pointwise_conv_info);
 
     return dst;
diff --git a/tests/validation/reference/FlattenLayer.cpp b/tests/validation/reference/FlattenLayer.cpp
index 611701d..44f4d93 100644
--- a/tests/validation/reference/FlattenLayer.cpp
+++ b/tests/validation/reference/FlattenLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -34,12 +34,8 @@
 namespace reference
 {
 template <typename T>
-SimpleTensor<T> flatten_layer(const SimpleTensor<T> &src)
+SimpleTensor<T> flatten_layer(const SimpleTensor<T> &src, const TensorShape &shape_flatten)
 {
-    TensorShape shape_flatten(src.shape());
-    shape_flatten.set(0, src.shape()[0] * src.shape()[1] * src.shape()[2]);
-    shape_flatten.remove_dimension(1);
-    shape_flatten.remove_dimension(1);
     SimpleTensor<T> dst(shape_flatten, src.data_type(), 1, src.fixed_point_position());
 
     // Note: Since the reference implementation does not use padding bytes, we can copy directly the content of the source tensor
@@ -48,10 +44,10 @@
     return dst;
 }
 
-template SimpleTensor<float> flatten_layer(const SimpleTensor<float> &src);
-template SimpleTensor<half> flatten_layer(const SimpleTensor<half> &src);
-template SimpleTensor<qint8_t> flatten_layer(const SimpleTensor<qint8_t> &src);
-template SimpleTensor<qint16_t> flatten_layer(const SimpleTensor<qint16_t> &src);
+template SimpleTensor<float> flatten_layer(const SimpleTensor<float> &src, const TensorShape &shape_flatten);
+template SimpleTensor<half> flatten_layer(const SimpleTensor<half> &src, const TensorShape &shape_flatten);
+template SimpleTensor<qint8_t> flatten_layer(const SimpleTensor<qint8_t> &src, const TensorShape &shape_flatten);
+template SimpleTensor<qint16_t> flatten_layer(const SimpleTensor<qint16_t> &src, const TensorShape &shape_flatten);
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/FlattenLayer.h b/tests/validation/reference/FlattenLayer.h
index b1286fe..5ccd429 100644
--- a/tests/validation/reference/FlattenLayer.h
+++ b/tests/validation/reference/FlattenLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -36,7 +36,7 @@
 namespace reference
 {
 template <typename T>
-SimpleTensor<T> flatten_layer(const SimpleTensor<T> &src);
+SimpleTensor<T> flatten_layer(const SimpleTensor<T> &src, const TensorShape &shape_flatten);
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/GEMM.cpp b/tests/validation/reference/GEMM.cpp
index 77d025e..f9dcfcb 100644
--- a/tests/validation/reference/GEMM.cpp
+++ b/tests/validation/reference/GEMM.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -41,23 +41,44 @@
     SimpleTensor<T> dst{ c.shape(), c.data_type(), 1, c.fixed_point_position() };
 
     // Compute reference
-    const int M = dst.shape().y();
-    const int N = dst.shape().x();
+    const int M = a.shape().y();
+    const int N = b.shape().x();
     const int K = a.shape().x();
+    const int D = a.shape().z(); // Number of matrices in a batch
+    const int W = a.shape()[3];  // Number of batched-gemm (Winograd case)
 
-    for(int row = 0; row < M; ++row)
+    const int a_stride_z = K * M;
+    const int a_stride_w = K * M * D;
+
+    const int b_stride_z = b.shape().num_dimensions() > 2 ? N * K : 0;     // Do not slide the matrix B along the 3th dimension in case matrix B has less than 3 dimensions
+    const int b_stride_w = b.shape().num_dimensions() > 3 ? K * N * D : 0; // Do not slide the matrix B along the 4th dimension in case matrix B has less than 4 dimensions
+
+    const int c_stride_z = N * M;
+    const int c_stride_w = N * M * D;
+
+    for(int w = 0; w < W; ++w)
     {
-        for(int col = 0; col < N; ++col)
+        for(int depth = 0; depth < D; ++depth)
         {
-            T acc(0);
+            const int base_addr_a = depth * a_stride_z + w * a_stride_w;
+            const int base_addr_b = depth * b_stride_z + w * b_stride_w;
+            const int base_addr_c = depth * c_stride_z + w * c_stride_w;
 
-            for(int k = 0; k < K; ++k)
+            for(int row = 0; row < M; ++row)
             {
-                acc += a[row * K + k] * b[k * N + col];
-            }
+                for(int col = 0; col < N; ++col)
+                {
+                    T acc(0);
 
-            // Finalize the result: alpha * A * B + beta * C
-            dst[col + row * N] = alpha * acc + beta * c[col + row * N];
+                    for(int k = 0; k < K; ++k)
+                    {
+                        acc += a[base_addr_a + k + row * K] * b[base_addr_b + col + k * N];
+                    }
+
+                    // Finalize the result: alpha * A * B + beta * C
+                    dst[base_addr_c + col + row * N] = alpha * acc + beta * c[base_addr_c + col + row * N];
+                }
+            }
         }
     }
 
@@ -75,37 +96,58 @@
     // Compute reference
     using promoted_type = fixed_point_arithmetic::traits::promote_t<T>;
 
-    const int M                    = dst.shape().y();
-    const int N                    = dst.shape().x();
-    const int K                    = a.shape().x();
-    const int fixed_point_position = a.fixed_point_position();
+    const int M = dst.shape().y();
+    const int N = dst.shape().x();
+    const int K = a.shape().x();
+    const int D = a.shape().z(); // Number of matrices in a batch
+    const int W = a.shape()[3];  // Number of batched-gemm (Winograd case)
 
+    const int a_stride_z = K * M;
+    const int a_stride_w = K * M * D;
+
+    const int b_stride_z = b.shape().num_dimensions() > 2 ? N * K : 0;     // Do not slide the matrix B along the 3th dimension in case matrix B has less than 3 dimensions
+    const int b_stride_w = b.shape().num_dimensions() > 3 ? K * N * D : 0; // Do not slide the matrix B along the 4th dimension in case matrix B has less than 4 dimensions
+
+    const int c_stride_z = N * M;
+    const int c_stride_w = N * M * D;
+
+    const int            fixed_point_position = a.fixed_point_position();
     const fixed_point<T> alpha_q(alpha, fixed_point_position);
     const fixed_point<T> beta_q(beta, fixed_point_position);
 
-    for(int row = 0; row < M; ++row)
+    for(int w = 0; w < W; ++w)
     {
-        for(int col = 0; col < N; ++col)
+        for(int depth = 0; depth < D; ++depth)
         {
-            fixed_point<promoted_type> acc_q(0, fixed_point_position);
+            const int base_addr_a = depth * a_stride_z + w * a_stride_w;
+            const int base_addr_b = depth * b_stride_z + w * b_stride_w;
+            const int base_addr_c = depth * c_stride_z + w * c_stride_w;
 
-            for(int k = 0; k < K; ++k)
+            for(int row = 0; row < M; ++row)
             {
-                const fixed_point<promoted_type> a0_q(a[row * K + k], fixed_point_position, true);
-                const fixed_point<promoted_type> b0_q(b[k * N + col], fixed_point_position, true);
+                for(int col = 0; col < N; ++col)
+                {
+                    fixed_point<promoted_type> acc_q(0, fixed_point_position);
 
-                acc_q = acc_q + (a0_q * b0_q);
+                    for(int k = 0; k < K; ++k)
+                    {
+                        const fixed_point<promoted_type> a0_q(a[base_addr_a + row * K + k], fixed_point_position, true);
+                        const fixed_point<promoted_type> b0_q(b[base_addr_b + k * N + col], fixed_point_position, true);
+
+                        acc_q = acc_q + (a0_q * b0_q);
+                    }
+
+                    // Finalize the result: alpha * A * B + beta * C
+                    const fixed_point<T> c0_q(c[base_addr_c + col + row * N], fixed_point_position, true);
+
+                    fixed_point<T> res_q(acc_q);
+                    res_q = alpha_q * res_q;
+                    res_q = res_q + (beta_q * c0_q);
+
+                    // Store the result
+                    dst[base_addr_c + col + row * N] = res_q.raw();
+                }
             }
-
-            // Finalize the result: alpha * A * B + beta * C
-            const fixed_point<T> c0_q(c[col + row * N], fixed_point_position, true);
-
-            fixed_point<T> res_q(acc_q);
-            res_q = alpha_q * res_q;
-            res_q = res_q + (beta_q * c0_q);
-
-            // Store the result
-            dst[col + row * N] = res_q.raw();
         }
     }
 
diff --git a/tests/validation/reference/GaussianPyramidHalf.cpp b/tests/validation/reference/GaussianPyramidHalf.cpp
index 0a68ded..7d5eb07 100644
--- a/tests/validation/reference/GaussianPyramidHalf.cpp
+++ b/tests/validation/reference/GaussianPyramidHalf.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/HOGDescriptor.cpp b/tests/validation/reference/HOGDescriptor.cpp
index 105eb83..ed22695 100644
--- a/tests/validation/reference/HOGDescriptor.cpp
+++ b/tests/validation/reference/HOGDescriptor.cpp
@@ -255,6 +255,8 @@
     return desc;
 }
 
+template void hog_orientation_binning(const SimpleTensor<int16_t> &mag, const SimpleTensor<uint8_t> &phase, SimpleTensor<float> &hog_space, const HOGInfo &hog_info);
+template void hog_block_normalization(SimpleTensor<float> &desc, const SimpleTensor<float> &hog_space, const HOGInfo &hog_info);
 template SimpleTensor<float> hog_descriptor(const SimpleTensor<uint8_t> &src, BorderMode border_mode, uint8_t constant_border_value, const HOGInfo &hog_info);
 } // namespace reference
 } // namespace validation
diff --git a/tests/validation/reference/HOGDescriptor.h b/tests/validation/reference/HOGDescriptor.h
index e886445..6ea83fe 100644
--- a/tests/validation/reference/HOGDescriptor.h
+++ b/tests/validation/reference/HOGDescriptor.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017, 2018 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -34,6 +34,12 @@
 {
 namespace reference
 {
+template <typename T, typename U, typename V>
+void hog_orientation_binning(const SimpleTensor<T> &mag, const SimpleTensor<U> &phase, SimpleTensor<V> &hog_space, const HOGInfo &hog_info);
+
+template <typename T>
+void hog_block_normalization(SimpleTensor<T> &desc, const SimpleTensor<T> &hog_space, const HOGInfo &hog_info);
+
 template <typename T, typename U>
 SimpleTensor<T> hog_descriptor(const SimpleTensor<U> &src, BorderMode border_mode, U constant_border_value, const HOGInfo &hog_info);
 } // namespace reference
diff --git a/tests/validation/reference/HOGDetector.cpp b/tests/validation/reference/HOGDetector.cpp
new file mode 100644
index 0000000..5a5ae37
--- /dev/null
+++ b/tests/validation/reference/HOGDetector.cpp
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "HOGDetector.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+namespace
+{
+/** Computes the number of detection windows to iterate over in the feature vector. */
+Size2D num_detection_windows(const TensorShape &shape, const Size2D &window_step, const HOGInfo &hog_info)
+{
+    const size_t num_block_strides_width  = hog_info.detection_window_size().width / hog_info.block_stride().width;
+    const size_t num_block_strides_height = hog_info.detection_window_size().height / hog_info.block_stride().height;
+
+    return Size2D(floor_to_multiple(shape.x() - num_block_strides_width, window_step.width) + window_step.width,
+                  floor_to_multiple(shape.y() - num_block_strides_height, window_step.height) + window_step.height);
+}
+} // namespace
+
+template <typename T>
+std::vector<DetectionWindow> hog_detector(const SimpleTensor<T> &src, const std::vector<T> &descriptor, unsigned int max_num_detection_windows,
+                                          const HOGInfo &hog_info, const Size2D &detection_window_stride, float threshold, uint16_t idx_class)
+{
+    ARM_COMPUTE_ERROR_ON_MSG((detection_window_stride.width % hog_info.block_stride().width != 0),
+                             "Detection window stride width must be multiple of block stride width");
+    ARM_COMPUTE_ERROR_ON_MSG((detection_window_stride.height % hog_info.block_stride().height != 0),
+                             "Detection window stride height must be multiple of block stride height");
+
+    // Create vector for identifying each detection window
+    std::vector<DetectionWindow> windows;
+
+    // Calculate detection window step
+    const Size2D window_step(detection_window_stride.width / hog_info.block_stride().width,
+                             detection_window_stride.height / hog_info.block_stride().height);
+
+    // Calculate number of detection windows
+    const Size2D num_windows = num_detection_windows(src.shape(), window_step, hog_info);
+
+    // Calculate detection window and row offsets in feature vector
+    const size_t src_offset_x   = window_step.width * hog_info.num_bins() * hog_info.num_cells_per_block().area();
+    const size_t src_offset_y   = window_step.height * hog_info.num_bins() * hog_info.num_cells_per_block().area() * src.shape().x();
+    const size_t src_offset_row = src.num_channels() * src.shape().x();
+
+    // Calculate detection window attributes
+    const Size2D       num_block_positions_per_detection_window = hog_info.num_block_positions_per_image(hog_info.detection_window_size());
+    const unsigned int num_bins_per_descriptor_x                = num_block_positions_per_detection_window.width * src.num_channels();
+    const unsigned int num_blocks_per_descriptor_y              = num_block_positions_per_detection_window.height;
+
+    ARM_COMPUTE_ERROR_ON((num_bins_per_descriptor_x * num_blocks_per_descriptor_y + 1) != hog_info.descriptor_size());
+
+    size_t win_id = 0;
+
+    // Traverse feature vector in detection window steps
+    for(auto win_y = 0u, offset_y = 0u; win_y < num_windows.height; win_y += window_step.height, offset_y += src_offset_y)
+    {
+        for(auto win_x = 0u, offset_x = 0u; win_x < num_windows.width; win_x += window_step.width, offset_x += src_offset_x)
+        {
+            // Reset the score
+            float score = 0.0f;
+
+            // Traverse detection window
+            for(auto y = 0u, offset_row = 0u; y < num_blocks_per_descriptor_y; ++y, offset_row += src_offset_row)
+            {
+                const int bin_offset = y * num_bins_per_descriptor_x;
+
+                for(auto x = 0u; x < num_bins_per_descriptor_x; ++x)
+                {
+                    // Compute Linear SVM
+                    const float a = src[x + offset_x + offset_y + offset_row];
+                    const float b = descriptor[x + bin_offset];
+                    score += a * b;
+                }
+            }
+
+            // Add the bias. The bias is located at the position (descriptor_size() - 1)
+            score += descriptor[num_bins_per_descriptor_x * num_blocks_per_descriptor_y];
+
+            if(score > threshold)
+            {
+                DetectionWindow window;
+
+                if(win_id++ < max_num_detection_windows)
+                {
+                    window.x         = win_x * hog_info.block_stride().width;
+                    window.y         = win_y * hog_info.block_stride().height;
+                    window.width     = hog_info.detection_window_size().width;
+                    window.height    = hog_info.detection_window_size().height;
+                    window.idx_class = idx_class;
+                    window.score     = score;
+
+                    windows.push_back(window);
+                }
+            }
+        }
+    }
+
+    return windows;
+}
+
+template std::vector<DetectionWindow> hog_detector(const SimpleTensor<float> &src, const std::vector<float> &descriptor, unsigned int max_num_detection_windows,
+                                                   const HOGInfo &hog_info, const Size2D &detection_window_stride, float threshold, uint16_t idx_class);
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/reference/HOGDetector.h b/tests/validation/reference/HOGDetector.h
new file mode 100644
index 0000000..e88acb8
--- /dev/null
+++ b/tests/validation/reference/HOGDetector.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_HOG_DETECTOR_H__
+#define __ARM_COMPUTE_TEST_HOG_DETECTOR_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "tests/SimpleTensor.h"
+
+#include <vector>
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+template <typename T>
+std::vector<DetectionWindow> hog_detector(const SimpleTensor<T> &src, const std::vector<T> &descriptor, unsigned int max_num_detection_windows,
+                                          const HOGInfo &hog_info, const Size2D &detection_window_stride, float threshold = 0.0f, uint16_t idx_class = 0);
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_TEST_HOG_DETECTOR_H__ */
diff --git a/tests/validation/reference/HOGMultiDetection.cpp b/tests/validation/reference/HOGMultiDetection.cpp
new file mode 100644
index 0000000..2f5e439
--- /dev/null
+++ b/tests/validation/reference/HOGMultiDetection.cpp
@@ -0,0 +1,279 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "HOGMultiDetection.h"
+
+#include "Derivative.h"
+#include "HOGDescriptor.h"
+#include "HOGDetector.h"
+#include "Magnitude.h"
+#include "Phase.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+namespace
+{
+// Checks that all HOG models used in one multi-detection run agree on the
+// parameters that must be shared (phase type, normalization type and, when
+// L2HYS normalization is selected, the hysteresis threshold). Any mismatch
+// triggers an ARM_COMPUTE_ERROR; on success the function returns normally.
+void validate_models(const std::vector<HOGInfo> &models)
+{
+    // At least one model is required; models[0] is the reference all others are compared against.
+    ARM_COMPUTE_ERROR_ON(0 == models.size());
+
+    for(size_t i = 1; i < models.size(); ++i)
+    {
+        ARM_COMPUTE_ERROR_ON_MSG(models[0].phase_type() != models[i].phase_type(),
+                                 "All HOG parameters must have the same phase type");
+
+        ARM_COMPUTE_ERROR_ON_MSG(models[0].normalization_type() != models[i].normalization_type(),
+                                 "All HOG parameters must have the same normalization_type");
+
+        // The hysteresis threshold only has to match when L2HYS normalization is in use.
+        ARM_COMPUTE_ERROR_ON_MSG((models[0].l2_hyst_threshold() != models[i].l2_hyst_threshold()) && (models[0].normalization_type() == arm_compute::HOGNormType::L2HYS_NORM),
+                                 "All HOG parameters must have the same l2 hysteresis threshold if you use L2 hysteresis normalization type");
+    }
+}
+} // namespace
+
+// Greedy non-maxima suppression over the combined detection windows of all
+// models. Windows are sorted by class first, then by descending score; each
+// kept window invalidates any later window of the same class whose centre
+// lies within min_distance (Euclidean) of its own centre. multi_windows is
+// modified in place and resized to the surviving detections.
+void detection_windows_non_maxima_suppression(std::vector<DetectionWindow> &multi_windows, float min_distance)
+{
+    const size_t num_candidates = multi_windows.size();
+    size_t       num_detections = 0;
+
+    // Sort by idx_class first and by score second
+    std::sort(multi_windows.begin(), multi_windows.end(), [](const DetectionWindow & lhs, const DetectionWindow & rhs)
+    {
+        if(lhs.idx_class < rhs.idx_class)
+        {
+            return true;
+        }
+        if(rhs.idx_class < lhs.idx_class)
+        {
+            return false;
+        }
+
+        // idx_classes are equal so compare by score
+        if(lhs.score > rhs.score)
+        {
+            return true;
+        }
+        if(rhs.score > lhs.score)
+        {
+            return false;
+        }
+
+        // Equal (class, score) pairs compare "not less" both ways, keeping the comparator a strict weak ordering.
+        return false;
+    });
+
+    const float min_distance_pow2 = min_distance * min_distance;
+
+    // Euclidean distance
+    for(size_t i = 0; i < num_candidates; ++i)
+    {
+        // A score of 0 marks a window that was already suppressed by an earlier, higher-scoring one.
+        if(0.0f != multi_windows.at(i).score)
+        {
+            DetectionWindow cur;
+            cur.x         = multi_windows.at(i).x;
+            cur.y         = multi_windows.at(i).y;
+            cur.width     = multi_windows.at(i).width;
+            cur.height    = multi_windows.at(i).height;
+            cur.idx_class = multi_windows.at(i).idx_class;
+            cur.score     = multi_windows.at(i).score;
+
+            // Store window (compacting survivors to the front of the vector)
+            multi_windows.at(num_detections) = cur;
+            ++num_detections;
+
+            // Window centre; x/y are treated as the top-left corner here.
+            const float xc = cur.x + cur.width * 0.5f;
+            const float yc = cur.y + cur.height * 0.5f;
+
+            // Only later windows of the same class can be suppressed; the sort
+            // above groups classes together, so the scan stops at the class boundary.
+            for(size_t k = i + 1; k < (num_candidates) && (cur.idx_class == multi_windows.at(k).idx_class); ++k)
+            {
+                const float xn = multi_windows.at(k).x + multi_windows.at(k).width * 0.5f;
+                const float yn = multi_windows.at(k).y + multi_windows.at(k).height * 0.5f;
+
+                const float dx = std::fabs(xn - xc);
+                const float dy = std::fabs(yn - yc);
+
+                // Cheap bounding-box rejection before the exact squared-distance test.
+                if(dx < min_distance && dy < min_distance)
+                {
+                    const float d = dx * dx + dy * dy;
+
+                    if(d < min_distance_pow2)
+                    {
+                        // Invalidate detection window
+                        multi_windows.at(k).score = 0.0f;
+                    }
+                }
+            }
+        }
+    }
+
+    multi_windows.resize(num_detections);
+}
+
+// Reference multi-HOG detection pipeline:
+//   derivatives -> magnitude/phase -> orientation binning -> block
+//   normalization -> per-model sliding-window detection -> optional NMS.
+// Orientation-binning and block-normalization stages are shared between
+// models whose cell/block geometry matches, mirroring the optimized
+// implementation. descriptors[i] is the trained descriptor for models[i];
+// detection windows returned by hog_detector carry the model index i as
+// their idx_class.
+template <typename T>
+std::vector<DetectionWindow> hog_multi_detection(const SimpleTensor<T> &src, BorderMode border_mode, T constant_border_value,
+                                                 const std::vector<HOGInfo> &models, std::vector<std::vector<float>> descriptors,
+                                                 unsigned int max_num_detection_windows, float threshold, bool non_maxima_suppression, float min_distance)
+{
+    ARM_COMPUTE_ERROR_ON(descriptors.size() != models.size());
+    validate_models(models);
+
+    const size_t width      = src.shape().x();
+    const size_t height     = src.shape().y();
+    const size_t num_models = models.size();
+
+    // Initialize previous values
+    size_t prev_num_bins     = models[0].num_bins();
+    Size2D prev_cell_size    = models[0].cell_size();
+    Size2D prev_block_size   = models[0].block_size();
+    Size2D prev_block_stride = models[0].block_stride();
+
+    // input_orient_bin[j] : model index whose parameters drive the j-th orientation-binning stage
+    // input_block_norm[j] : (model index, orientation-binning stage index) for the j-th normalization stage
+    // input_hog_detect[i] : normalization stage index that feeds the detector of model i
+    std::vector<size_t> input_orient_bin;
+    std::vector<size_t> input_hog_detect;
+    std::vector<std::pair<size_t, size_t>> input_block_norm;
+
+    input_orient_bin.push_back(0);
+    input_hog_detect.push_back(0);
+    input_block_norm.emplace_back(0, 0);
+
+    // Iterate through the number of models and check if orientation binning
+    // and block normalization steps can be skipped
+    for(size_t i = 1; i < num_models; ++i)
+    {
+        size_t cur_num_bins     = models[i].num_bins();
+        Size2D cur_cell_size    = models[i].cell_size();
+        Size2D cur_block_size   = models[i].block_size();
+        Size2D cur_block_stride = models[i].block_stride();
+
+        // Check if binning and normalization steps are required
+        if((cur_num_bins != prev_num_bins) || (cur_cell_size.width != prev_cell_size.width) || (cur_cell_size.height != prev_cell_size.height))
+        {
+            prev_num_bins     = cur_num_bins;
+            prev_cell_size    = cur_cell_size;
+            prev_block_size   = cur_block_size;
+            prev_block_stride = cur_block_stride;
+
+            // Compute orientation binning and block normalization. Update input to process
+            input_orient_bin.push_back(i);
+            input_block_norm.emplace_back(i, input_orient_bin.size() - 1);
+        }
+        else if((cur_block_size.width != prev_block_size.width) || (cur_block_size.height != prev_block_size.height) || (cur_block_stride.width != prev_block_stride.width)
+                || (cur_block_stride.height != prev_block_stride.height))
+        {
+            prev_block_size   = cur_block_size;
+            prev_block_stride = cur_block_stride;
+
+            // Compute block normalization. Update input to process
+            input_block_norm.emplace_back(i, input_orient_bin.size() - 1);
+        }
+
+        // Update input to process for hog detector
+        input_hog_detect.push_back(input_block_norm.size() - 1);
+    }
+
+    size_t num_orient_bin = input_orient_bin.size();
+    size_t num_block_norm = input_block_norm.size();
+    size_t num_hog_detect = input_hog_detect.size();
+
+    std::vector<SimpleTensor<float>> hog_spaces(num_orient_bin);
+    std::vector<SimpleTensor<float>> hog_norm_spaces(num_block_norm);
+
+    // Calculate derivative
+    SimpleTensor<int16_t> grad_x;
+    SimpleTensor<int16_t> grad_y;
+    std::tie(grad_x, grad_y) = derivative<int16_t>(src, border_mode, constant_border_value, GradientDimension::GRAD_XY);
+
+    // Calculate magnitude and phase
+    SimpleTensor<int16_t> _mag   = magnitude(grad_x, grad_y, MagnitudeType::L2NORM);
+    SimpleTensor<uint8_t> _phase = phase(grad_x, grad_y, models[0].phase_type());
+
+    // Calculate Tensors for the HOG space and orientation binning
+    for(size_t i = 0; i < num_orient_bin; ++i)
+    {
+        const size_t idx_multi_hog = input_orient_bin[i];
+
+        const size_t num_bins    = models[idx_multi_hog].num_bins();
+        const size_t num_cells_x = width / models[idx_multi_hog].cell_size().width;
+        const size_t num_cells_y = height / models[idx_multi_hog].cell_size().height;
+
+        // TensorShape of hog space: one element per cell, num_bins channels per element
+        TensorShape hog_space_shape(num_cells_x, num_cells_y);
+
+        // Initialise HOG space
+        TensorInfo info_hog_space(hog_space_shape, num_bins, DataType::F32);
+        hog_spaces.at(i) = SimpleTensor<float>(info_hog_space.tensor_shape(), DataType::F32, info_hog_space.num_channels());
+
+        // For each cell create histogram based on magnitude and phase
+        hog_orientation_binning(_mag, _phase, hog_spaces[i], models[idx_multi_hog]);
+    }
+
+    // Calculate Tensors for the normalized HOG space and block normalization
+    for(size_t i = 0; i < num_block_norm; ++i)
+    {
+        const size_t idx_multi_hog  = input_block_norm[i].first;
+        const size_t idx_orient_bin = input_block_norm[i].second;
+
+        // Create tensor info for HOG descriptor
+        TensorInfo tensor_info(models[idx_multi_hog], src.shape().x(), src.shape().y());
+        hog_norm_spaces.at(i) = SimpleTensor<float>(tensor_info.tensor_shape(), DataType::F32, tensor_info.num_channels());
+
+        // Normalize histograms based on block size
+        hog_block_normalization(hog_norm_spaces[i], hog_spaces[idx_orient_bin], models[idx_multi_hog]);
+    }
+
+    std::vector<DetectionWindow> multi_windows;
+
+    // Calculate Detection Windows for HOG detector
+    for(size_t i = 0; i < num_hog_detect; ++i)
+    {
+        const size_t idx_block_norm = input_hog_detect[i];
+
+        // NOTE: Detection window stride fixed to block stride
+        const Size2D detection_window_stride = models[i].block_stride();
+
+        // The model index i doubles as the detection class identifier.
+        std::vector<DetectionWindow> windows = hog_detector(hog_norm_spaces[idx_block_norm], descriptors[i],
+                                                            max_num_detection_windows, models[i], detection_window_stride, threshold, i);
+
+        multi_windows.insert(multi_windows.end(), windows.begin(), windows.end());
+    }
+
+    // Suppress Non-maxima detection windows
+    if(non_maxima_suppression)
+    {
+        detection_windows_non_maxima_suppression(multi_windows, min_distance);
+    }
+
+    return multi_windows;
+}
+
+// Only U8 inputs are exercised by the validation suite.
+template std::vector<DetectionWindow> hog_multi_detection(const SimpleTensor<uint8_t> &src, BorderMode border_mode, uint8_t constant_border_value,
+                                                          const std::vector<HOGInfo> &models, std::vector<std::vector<float>> descriptors,
+                                                          unsigned int max_num_detection_windows, float threshold, bool non_maxima_suppression, float min_distance);
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/reference/HOGMultiDetection.h b/tests/validation/reference/HOGMultiDetection.h
new file mode 100644
index 0000000..6d75bf4
--- /dev/null
+++ b/tests/validation/reference/HOGMultiDetection.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_HOG_MULTI_DETECTION_H__
+#define __ARM_COMPUTE_TEST_HOG_MULTI_DETECTION_H__
+
+#include "arm_compute/core/Types.h"
+#include "tests/SimpleTensor.h"
+
+#include <vector>
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+template <typename T>
+std::vector<DetectionWindow> hog_multi_detection(const SimpleTensor<T> &src, BorderMode border_mode, T constant_border_value,
+                                                 const std::vector<HOGInfo> &models, std::vector<std::vector<float>> descriptors,
+                                                 unsigned int max_num_detection_windows, float threshold = 0.0f, bool non_maxima_suppression = false, float min_distance = 1.0f);
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_TEST_HOG_MULTI_DETECTION_H__ */
diff --git a/tests/validation/reference/Im2Col.cpp b/tests/validation/reference/Im2Col.cpp
new file mode 100644
index 0000000..5685b60
--- /dev/null
+++ b/tests/validation/reference/Im2Col.cpp
@@ -0,0 +1,155 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "Im2Col.h"
+
+#include "Permute.h"
+
+#include "arm_compute/core/Types.h"
+#include "tests/validation/Helpers.h"
+#include "tests/validation/reference/Utils.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+// Reference im2col for NCHW tensors: every convolution window position of
+// src is unrolled into one contiguous run of dst, ordered (batch, window-y,
+// window-x, channel, patch row, patch column). Reads outside the image use a
+// constant pad value (the quantization offset for asymmetric-quantized data,
+// zero otherwise). When has_bias is true a trailing 1 is appended per window
+// so the bias can be folded into the matrix multiply.
+template <typename T>
+void im2col_nchw(const SimpleTensor<T> &src, SimpleTensor<T> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias)
+{
+    ARM_COMPUTE_ERROR_ON(src.data_layout() != DataLayout::NCHW);
+    // Create reference
+    const int pad_x         = conv_info.pad().first;
+    const int pad_y         = conv_info.pad().second;
+    const int stride_x      = conv_info.stride().first;
+    const int stride_y      = conv_info.stride().second;
+    const int kernel_width  = kernel_dims.width;
+    const int kernel_height = kernel_dims.height;
+    const int src_width     = src.shape().x();
+    const int src_height    = src.shape().y();
+    const int src_depth     = src.shape().z();
+    const int batches       = src.shape().total_size_upper(3);
+    const int pad_val       = is_data_type_quantized_asymmetric(src.data_type()) ? src.quantization_info().offset : 0;
+
+    int dst_idx = 0;
+    for(int b = 0; b < batches; ++b)
+    {
+        // Window origins range over [-pad, dim + pad - kernel], matching the convolution output grid.
+        for(int y = -pad_y; y <= (src_height + pad_y - kernel_height); y += stride_y)
+        {
+            for(int x = -pad_x; x <= (src_width + pad_x - kernel_width); x += stride_x)
+            {
+                for(int z = 0; z < src_depth; ++z)
+                {
+                    for(int patch_y = y; patch_y < (y + kernel_height); ++patch_y)
+                    {
+                        for(int patch_x = x; patch_x < (x + kernel_width); ++patch_x)
+                        {
+                            // tensor_elem_at handles the out-of-bounds coordinates via constant-border padding.
+                            dst[dst_idx++] = tensor_elem_at(src, Coordinates(patch_x, patch_y, z, b), BorderMode::CONSTANT, static_cast<T>(pad_val));
+                        }
+                    }
+                }
+
+                if(has_bias)
+                {
+                    dst[dst_idx++] = static_cast<T>(1);
+                }
+            }
+        }
+    }
+}
+
+// Reference im2col for NHWC tensors. Same unrolling as im2col_nchw, but in
+// NHWC the SimpleTensor shape axes map as x = channels, y = width,
+// z = height, and each output element walks channels fastest
+// (channel, patch row, patch column) for a given window position.
+template <typename T>
+void im2col_nhwc(const SimpleTensor<T> &src, SimpleTensor<T> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias)
+{
+    ARM_COMPUTE_ERROR_ON(src.data_layout() != DataLayout::NHWC);
+    const int pad_x         = conv_info.pad().first;
+    const int pad_y         = conv_info.pad().second;
+    const int stride_x      = conv_info.stride().first;
+    const int stride_y      = conv_info.stride().second;
+    const int kernel_width  = kernel_dims.width;
+    const int kernel_height = kernel_dims.height;
+    // NHWC layout: shape().x() = channels, shape().y() = width, shape().z() = height.
+    const int src_width     = src.shape().y();
+    const int src_height    = src.shape().z();
+    const int src_depth     = src.shape().x();
+    const int batches       = src.shape().total_size_upper(3);
+    const int pad_val       = is_data_type_quantized_asymmetric(src.data_type()) ? src.quantization_info().offset : 0;
+    int       dst_idx       = 0;
+    for(int b = 0; b < batches; ++b)
+    {
+        // Window origins range over [-pad, dim + pad - kernel], matching the convolution output grid.
+        for(int y = -pad_y; y <= (src_height + pad_y - kernel_height); y += stride_y)
+        {
+            for(int x = -pad_x; x <= (src_width + pad_x - kernel_width); x += stride_x)
+            {
+                for(int z = 0; z < src_depth; ++z)
+                {
+                    for(int patch_y = y; patch_y < (y + kernel_height); ++patch_y)
+                    {
+                        for(int patch_x = x; patch_x < (x + kernel_width); ++patch_x)
+                        {
+                            // Coordinates order is (channel, x, y, batch) for NHWC SimpleTensors.
+                            dst[dst_idx++] = tensor_elem_at(src, Coordinates(z, patch_x, patch_y, b), BorderMode::CONSTANT, static_cast<T>(pad_val));
+                        }
+                    }
+                }
+
+                if(has_bias)
+                {
+                    dst[dst_idx++] = static_cast<T>(1);
+                }
+            }
+        }
+    }
+}
+
+// Public im2col entry point: dispatches on the source tensor's data layout.
+// dst must already be allocated with the expected im2col output shape;
+// unsupported layouts abort with an ARM_COMPUTE_ERROR.
+template <typename T>
+void im2col(const SimpleTensor<T> &src, SimpleTensor<T> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias)
+{
+    switch(src.data_layout())
+    {
+        case DataLayout::NCHW:
+        {
+            im2col_nchw(src, dst, kernel_dims, conv_info, has_bias);
+            break;
+        }
+        case DataLayout::NHWC:
+        {
+            im2col_nhwc(src, dst, kernel_dims, conv_info, has_bias);
+            break;
+        }
+        default:
+        {
+            ARM_COMPUTE_ERROR("Not supported.");
+            break;
+        }
+    }
+}
+
+// Instantiations for the data types exercised by the validation suite.
+template void im2col(const SimpleTensor<uint8_t> &src, SimpleTensor<uint8_t> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias);
+template void im2col(const SimpleTensor<half> &src, SimpleTensor<half> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias);
+template void im2col(const SimpleTensor<float> &src, SimpleTensor<float> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias);
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/reference/Im2Col.h b/tests/validation/reference/Im2Col.h
new file mode 100644
index 0000000..5277171
--- /dev/null
+++ b/tests/validation/reference/Im2Col.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_IM2COL_H__
+#define __ARM_COMPUTE_TEST_IM2COL_H__
+
+#include "tests/SimpleTensor.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+template <typename T>
+void im2col(const SimpleTensor<T> &src, SimpleTensor<T> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias);
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_TEST_IM2COL_H__ */
diff --git a/tests/validation/reference/LocallyConnected.cpp b/tests/validation/reference/LocallyConnected.cpp
new file mode 100644
index 0000000..08e3f02
--- /dev/null
+++ b/tests/validation/reference/LocallyConnected.cpp
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "LocallyConnected.h"
+
+#include "tests/validation/Helpers.h"
+#include "tests/validation/reference/Convolution3d.h"
+#include "tests/validation/reference/Utils.h"
+
+#include "tests/framework/Asserts.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+// Reference locally-connected layer: structurally a convolution, but every
+// output spatial position uses its own (unshared) set of weights. `count`
+// enumerates (window position, output channel) pairs in scan order and
+// selects the corresponding weight slice and bias entry for each 3D
+// convolution. Returns a tensor of output_shape with src's data type,
+// fixed-point position and quantization info.
+template <typename T, typename TB>
+SimpleTensor<T> locally_connected(const SimpleTensor<T> &src, const SimpleTensor<T> &weights, const SimpleTensor<TB> &bias, const TensorShape &output_shape, const PadStrideInfo &info)
+{
+    // Create reference
+    SimpleTensor<T> dst{ output_shape, src.data_type(), 1, src.fixed_point_position(), src.quantization_info() };
+
+    // Compute reference
+    const int width_in  = src.shape().x();
+    const int height_in = src.shape().y();
+    const int depth_in  = src.shape().z();
+
+    const int width_out  = dst.shape().x();
+    const int height_out = dst.shape().y();
+    const int depth_out  = dst.shape().z();
+
+    const int width_weights  = weights.shape().x();
+    const int height_weights = weights.shape().y();
+    const int depth_weights  = weights.shape().z();
+
+    const int pad_left  = info.pad_left();
+    const int pad_top   = info.pad_top();
+    const int stride_xi = info.stride().first;
+    const int stride_yi = info.stride().second;
+
+    auto output_wh = scaled_dimensions(width_in, height_in, width_weights, height_weights, info);
+
+    // Centre of the first kernel window in input coordinates (convolution3d samples around the centre).
+    const int start_xi    = width_weights / 2 - pad_left;
+    const int start_yi    = height_weights / 2 - pad_top;
+    const int end_xi      = output_wh.first * stride_xi;
+    const int end_yi      = output_wh.second * stride_yi;
+    const int num_batches = src.shape().total_size() / (width_in * height_in * depth_in);
+
+    for(int r = 0; r < num_batches; ++r)
+    {
+        // count restarts per batch: weights are shared across batches but not across positions.
+        int count = 0;
+        for(int yi = start_yi; yi < start_yi + end_yi; yi += stride_yi)
+        {
+            for(int xi = start_xi; xi < start_xi + end_xi; xi += stride_xi)
+            {
+                for(int ofm = 0; ofm < depth_out; ++ofm)
+                {
+                    // Compute input and output offsets
+                    const int offset_in  = r * width_in * height_in * depth_in;
+                    const int xo         = (xi - start_xi) / stride_xi;
+                    const int yo         = (yi - start_yi) / stride_yi;
+                    const int offset_out = xo + yo * width_out + ofm * width_out * height_out + r * width_out * height_out * depth_out;
+
+                    ARM_COMPUTE_ASSERT(xo < width_out);
+                    ARM_COMPUTE_ASSERT(yo < height_out);
+
+                    // Compute 3D convolution; the weight offset advances by one full
+                    // kernel volume per (position, output channel) pair.
+                    convolution_3d::detail::convolution3d(src, weights, bias, dst,
+                                                          offset_in, count * width_weights * height_weights * depth_weights, count, offset_out,
+                                                          xi, yi,
+                                                          width_in, height_in, depth_in,
+                                                          width_weights, height_weights);
+                    count++;
+                }
+            }
+        }
+    }
+
+    return dst;
+}
+
+// Locally Connected only supports F32
+template SimpleTensor<float> locally_connected(const SimpleTensor<float> &src, const SimpleTensor<float> &weights, const SimpleTensor<float> &bias, const TensorShape &output_shape,
+                                               const PadStrideInfo &info);
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/reference/LocallyConnected.h b/tests/validation/reference/LocallyConnected.h
new file mode 100644
index 0000000..bf78d2c
--- /dev/null
+++ b/tests/validation/reference/LocallyConnected.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_LOCALLY_CONNECTED_H__
+#define __ARM_COMPUTE_TEST_LOCALLY_CONNECTED_H__
+
+#include "tests/SimpleTensor.h"
+#include "tests/validation/Helpers.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+template <typename T, typename TB>
+SimpleTensor<T> locally_connected(const SimpleTensor<T> &src, const SimpleTensor<T> &weights, const SimpleTensor<TB> &bias, const TensorShape &output_shape, const PadStrideInfo &info);
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_TEST_LOCALLY_CONNECTED_H__ */
diff --git a/tests/validation/reference/OpticalFlow.cpp b/tests/validation/reference/OpticalFlow.cpp
new file mode 100644
index 0000000..da0b9f9
--- /dev/null
+++ b/tests/validation/reference/OpticalFlow.cpp
@@ -0,0 +1,404 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "OpticalFlow.h"
+
+#include "GaussianPyramidHalf.h"
+#include "Scharr.h"
+#include "Utils.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+namespace
+{
+using KeyPointArray         = std::vector<KeyPoint>;
+using InternalKeyPointArray = std::vector<InternalKeyPoint>;
+
+// Constants used for Lucas-Kanade Algorithm
+constexpr int   W_BITS                = 14;
+constexpr float D0                    = 1 << W_BITS;
+constexpr float DETERMINANT_THRESHOLD = 1.0e-07f;
+constexpr float EIGENVALUE_THRESHOLD  = 1.0e-04f;
+constexpr float FLT_SCALE             = 1.0f / (1 << 20);
+
+// Converts a KeyPointArray (integer pixel coordinates) into an
+// InternalKeyPointArray whose x/y are floats, so the tracker can accumulate
+// non-integral (sub-pixel) positions across iterations
+InternalKeyPointArray create_internal_keypoints(const KeyPointArray &keypoints)
+{
+    InternalKeyPointArray internal_keypoints;
+
+    for(auto keypoint : keypoints)
+    {
+        InternalKeyPoint internal_keypoint;
+
+        // Widen integer pixel coordinates to float for sub-pixel tracking
+        internal_keypoint.x               = static_cast<float>(keypoint.x);
+        internal_keypoint.y               = static_cast<float>(keypoint.y);
+        internal_keypoint.tracking_status = static_cast<bool>(keypoint.tracking_status);
+
+        internal_keypoints.push_back(internal_keypoint);
+    }
+
+    return internal_keypoints;
+}
+
+// Scale tracked points based on Pyramid level.
+// On the coarsest level (level == num_levels - 1) the original keypoints are
+// scaled down to that level's resolution and tracking is (re)initialized; on
+// every finer level the points carried over from the previous level are scaled
+// up by dividing by SCALE_PYRAMID_HALF (presumably 0.5 — defined in
+// GaussianPyramidHalf.h, TODO confirm).
+void scale_tracked_points(size_t level, size_t num_levels, bool use_initial_estimate,
+                          InternalKeyPointArray &old_points_internal, InternalKeyPointArray &new_points_internal,
+                          const KeyPointArray &old_points, const KeyPointArray &new_points_estimates)
+{
+    if(level == num_levels - 1) // lowest resolution
+    {
+        // Cumulative scale factor from full resolution down to this level
+        const float scale = std::pow(SCALE_PYRAMID_HALF, level);
+
+        for(size_t i = 0; i < old_points.size(); ++i)
+        {
+            old_points_internal.at(i).x               = old_points.at(i).x * scale;
+            old_points_internal.at(i).y               = old_points.at(i).y * scale;
+            old_points_internal.at(i).tracking_status = true;
+
+            InternalKeyPoint keypoint_to_track;
+
+            if(use_initial_estimate)
+            {
+                // Seed the search with the caller-provided estimates
+                keypoint_to_track.x               = new_points_estimates.at(i).x * scale;
+                keypoint_to_track.y               = new_points_estimates.at(i).y * scale;
+                keypoint_to_track.tracking_status = (new_points_estimates.at(i).tracking_status == 1);
+            }
+            else
+            {
+                // Otherwise start the search from the old point's position
+                keypoint_to_track.x               = old_points_internal.at(i).x;
+                keypoint_to_track.y               = old_points_internal.at(i).y;
+                keypoint_to_track.tracking_status = true;
+            }
+
+            new_points_internal.at(i) = keypoint_to_track;
+        }
+    }
+    else
+    {
+        // Moving up one pyramid level: scale both point sets up accordingly
+        for(size_t i = 0; i < old_points.size(); ++i)
+        {
+            old_points_internal.at(i).x /= SCALE_PYRAMID_HALF;
+            old_points_internal.at(i).y /= SCALE_PYRAMID_HALF;
+            new_points_internal.at(i).x /= SCALE_PYRAMID_HALF;
+            new_points_internal.at(i).y /= SCALE_PYRAMID_HALF;
+        }
+    }
+}
+
+// Returns true when the square window centred on the keypoint does not fit
+// entirely inside the valid region of the current image, i.e. the keypoint
+// cannot be tracked at this pyramid level
+bool is_invalid_keypoint(const InternalKeyPoint &keypoint, const ValidRegion &valid_region, size_t window_dimension)
+{
+    const int half_window = window_dimension / 2;
+    const int x           = std::floor(keypoint.x);
+    const int y           = std::floor(keypoint.y);
+
+    return (x - half_window < valid_region.start(0)) || (x + half_window >= valid_region.end(0) - 1) || (y - half_window < valid_region.start(1)) || (y + half_window >= valid_region.end(1) - 1);
+}
+
+// Fixed-point round-to-nearest: adds half of the final LSB weight before an
+// arithmetic shift right by n bits
+template <typename T>
+constexpr int INT_ROUND(T x, int n)
+{
+    return (x + (1 << (n - 1))) >> n;
+}
+
+// Return the bilinear value at a specified coordinate with different border modes.
+// The 2x2 neighbourhood is blended with weights quantized to W_BITS fractional
+// bits; the fixed-point accumulation is rounded back down by 'scale' bits.
+template <typename T>
+int bilinear_interpolate(const SimpleTensor<T> &in, Coordinates id, float wx, float wy, BorderMode border_mode, T constant_border_value, int scale)
+{
+    // NOTE(review): 'level' is the x coordinate of the top-left sample here,
+    // not a pyramid level — naming kept as-is
+    const int level = id.x();
+    const int idy   = id.y();
+
+    // Fractional offsets within the pixel and their complements
+    const float dx   = wx;
+    const float dy   = wy;
+    const float dx_1 = 1.0f - dx;
+    const float dy_1 = 1.0f - dy;
+
+    const T border_value = constant_border_value;
+
+    // Fetch the 2x2 neighbourhood: top-left, top-right, bottom-left, bottom-right
+    id.set(0, level);
+    id.set(1, idy);
+    const T tl = tensor_elem_at(in, id, border_mode, border_value);
+    id.set(0, level + 1);
+    id.set(1, idy);
+    const T tr = tensor_elem_at(in, id, border_mode, border_value);
+    id.set(0, level);
+    id.set(1, idy + 1);
+    const T bl = tensor_elem_at(in, id, border_mode, border_value);
+    id.set(0, level + 1);
+    id.set(1, idy + 1);
+    const T br = tensor_elem_at(in, id, border_mode, border_value);
+
+    // weights
+    // The last weight is derived so all four always sum exactly to D0 (1 << W_BITS)
+    const int w00 = roundf(dx_1 * dy_1 * D0);
+    const int w01 = roundf(dx * dy_1 * D0);
+    const int w10 = roundf(dx_1 * dy * D0);
+    const int w11 = D0 - w00 - w01 - w10;
+
+    return static_cast<int>(INT_ROUND(tl * w00 + tr * w01 + bl * w10 + br * w11, scale));
+}
+
+// Samples a (window_dimension x window_dimension) patch centred on the
+// keypoint, returning the bilinearly interpolated value of each pixel in
+// row-major order. 'scale' is the number of fixed-point bits to round off
+// (W_BITS for gradients, W_BITS - 5 for raw image values at the call sites).
+template <typename T>
+std::vector<int> compute_derivative(const SimpleTensor<T> &input, const InternalKeyPoint &keypoint,
+                                    BorderMode border_mode, uint8_t constant_border_value, size_t window_dimension, int scale)
+{
+    std::vector<int> bilinear_values;
+
+    const int half_window = window_dimension / 2;
+
+    float keypoint_int_x = 0;
+    float keypoint_int_y = 0;
+
+    // Split the sub-pixel position into integer part and fractional weights
+    const float wx = std::modf(keypoint.x, &keypoint_int_x);
+    const float wy = std::modf(keypoint.y, &keypoint_int_y);
+
+    // Top-left and bottom-right corners of the sampling window (inclusive)
+    Coordinates tl_window(static_cast<int>(keypoint_int_x) - half_window, static_cast<int>(keypoint_int_y) - half_window);
+    Coordinates br_window(static_cast<int>(keypoint_int_x) + half_window, static_cast<int>(keypoint_int_y) + half_window);
+
+    for(int y = tl_window.y(); y <= br_window.y(); ++y)
+    {
+        for(int x = tl_window.x(); x <= br_window.x(); ++x)
+        {
+            bilinear_values.push_back(bilinear_interpolate(input, Coordinates(x, y), wx, wy, border_mode, static_cast<T>(constant_border_value), scale));
+        }
+    }
+
+    return bilinear_values;
+}
+
+// Builds the symmetric 2x2 spatial gradient matrix G = [A11 A12; A12 A22]
+// from the interpolated x/y derivatives of the window, accumulated in integer
+// arithmetic and scaled down by FLT_SCALE on return
+std::tuple<float, float, float> compute_spatial_gradient_matrix(const std::vector<int> &bilinear_ix, const std::vector<int> &bilinear_iy)
+{
+    ARM_COMPUTE_ERROR_ON(bilinear_ix.size() != bilinear_iy.size());
+
+    int iA11 = 0;
+    int iA12 = 0;
+    int iA22 = 0;
+
+    for(size_t i = 0; i < bilinear_ix.size(); ++i)
+    {
+        int ixval = bilinear_ix[i];
+        int iyval = bilinear_iy[i];
+
+        // Accumulate sums of Ix*Ix, Ix*Iy and Iy*Iy over the window
+        iA11 += ixval * ixval;
+        iA12 += ixval * iyval;
+        iA22 += iyval * iyval;
+    }
+
+    return std::make_tuple(iA11 * FLT_SCALE, iA12 * FLT_SCALE, iA22 * FLT_SCALE);
+}
+
+// Builds the temporal gradient (mismatch) vector b: the per-pixel intensity
+// difference between the new and old windows, weighted by the spatial
+// derivatives, accumulated in integer arithmetic and scaled by FLT_SCALE
+std::tuple<double, double> compute_temporal_gradient_vector(const std::vector<int> &bilinear_it_old,
+                                                            const std::vector<int> &bilinear_it_new,
+                                                            const std::vector<int> &bilinear_ix,
+                                                            const std::vector<int> &bilinear_iy)
+{
+    ARM_COMPUTE_ERROR_ON(bilinear_ix.size() != bilinear_iy.size());
+    ARM_COMPUTE_ERROR_ON(bilinear_it_old.size() != bilinear_it_new.size());
+
+    int ib1 = 0;
+    int ib2 = 0;
+
+    for(size_t i = 0; i < bilinear_ix.size(); ++i)
+    {
+        int ixval = bilinear_ix[i];
+        int iyval = bilinear_iy[i];
+        int ival  = bilinear_it_old[i];
+        int jval  = bilinear_it_new[i];
+
+        // Temporal intensity difference at this window pixel
+        const int diff = jval - ival;
+
+        ib1 += diff * ixval;
+        ib2 += diff * iyval;
+    }
+
+    const double b1 = ib1 * FLT_SCALE;
+    const double b2 = ib2 * FLT_SCALE;
+
+    return std::make_tuple(b1, b2);
+}
+} // namespace
+
+// Pyramidal Lucas-Kanade optical flow reference implementation.
+// For every input keypoint the new position is refined iteratively on each
+// pyramid level, from the coarsest to the finest resolution. Points whose
+// window leaves the image or whose gradient matrix is (near-)singular are
+// marked untracked on the finest level. Returns one output KeyPoint per
+// input point, with coordinates rounded to the nearest integer.
+template <typename T>
+std::vector<KeyPoint> optical_flow(const SimpleTensor<T> &old_input, const SimpleTensor<T> &new_input,
+                                   const OpticalFlowParameters &params, size_t num_levels,
+                                   const std::vector<KeyPoint> &old_points, const std::vector<KeyPoint> &new_points_estimates,
+                                   BorderMode border_mode, uint8_t constant_border_value)
+{
+    const int    filter_size      = 3;    // scharr filter size
+    const size_t max_iterations   = 1000; // fixed by kernel
+    const size_t window_dimension = params.window_dimension;
+    // With EPSILON-only termination the iteration count is capped by max_iterations
+    const size_t num_iterations   = (params.termination == Termination::TERM_CRITERIA_EPSILON) ? max_iterations : params.num_iterations;
+
+    KeyPointArray new_points(old_points.size());
+
+    InternalKeyPointArray old_points_internal = create_internal_keypoints(old_points);
+    InternalKeyPointArray new_points_internal = create_internal_keypoints(new_points_estimates);
+
+    SimpleTensor<int16_t> scharr_gx;
+    SimpleTensor<int16_t> scharr_gy;
+
+    // Create pyramids
+    std::vector<SimpleTensor<T>> old_pyramid = gaussian_pyramid_half(old_input, border_mode, constant_border_value, num_levels);
+    std::vector<SimpleTensor<T>> new_pyramid = gaussian_pyramid_half(new_input, border_mode, constant_border_value, num_levels);
+
+    // Iterate over each level of the pyramid, coarsest (num_levels - 1) first
+    for(size_t idx = num_levels; idx > 0; --idx)
+    {
+        const size_t level = idx - 1;
+
+        // Calculate scharr gradients
+        std::tie(scharr_gx, scharr_gy) = scharr<int16_t, T>(old_pyramid[level], filter_size, border_mode, constant_border_value, GradientDimension::GRAD_XY);
+
+        scale_tracked_points(level, num_levels, params.use_initial_estimate, old_points_internal, new_points_internal, old_points, new_points_estimates);
+
+        // Calculate valid region based on image dimensions of current pyramid level
+        const ValidRegion valid_region = shape_to_valid_region(old_pyramid[level].shape(), (border_mode == BorderMode::UNDEFINED), BorderSize(filter_size / 2));
+
+        for(size_t i = 0; i < old_points.size(); ++i)
+        {
+            InternalKeyPoint &old_keypoint = old_points_internal.at(i);
+            InternalKeyPoint &new_keypoint = new_points_internal.at(i);
+
+            // Helper function for untracking keypoints when on the lowest pyramid level (high resolution)
+            // On coarser levels a failed predicate skips the point but keeps it tracked
+            const auto untrack_keypoint = [&](bool predicate)
+            {
+                if(predicate && (level == 0))
+                {
+                    new_keypoint.tracking_status = false;
+                    return true;
+                }
+                return predicate;
+            };
+
+            if(!old_keypoint.tracking_status)
+            {
+                continue;
+            }
+
+            // Check if tracked coordinate is outside image coordinate
+            if(untrack_keypoint(is_invalid_keypoint(old_keypoint, valid_region, window_dimension)))
+            {
+                continue;
+            }
+
+            // Compute spatial derivative
+            std::vector<int> bilinear_ix = compute_derivative(scharr_gx, old_keypoint, border_mode, constant_border_value, window_dimension, W_BITS);
+            std::vector<int> bilinear_iy = compute_derivative(scharr_gy, old_keypoint, border_mode, constant_border_value, window_dimension, W_BITS);
+
+            float A11 = 0.f;
+            float A12 = 0.f;
+            float A22 = 0.f;
+            std::tie(A11, A12, A22) = compute_spatial_gradient_matrix(bilinear_ix, bilinear_iy);
+
+            // Calculate criteria for lost tracking : Matrix A is invertible
+            // 1. The determinant of the matrix is less than DETERMINANT_THRESHOLD
+            // 2. The minimum eigenvalue of the matrix is less than EIGENVALUE_THRESHOLD
+            const float trace_A      = A11 + A22;
+            const float determinant  = A11 * A22 - A12 * A12;
+            const float discriminant = (trace_A * trace_A) - 4.0f * (determinant);
+            // Smaller root of the characteristic polynomial = minimum eigenvalue
+            const float eigenvalue_A = (trace_A - std::sqrt(discriminant)) / 2.0f;
+
+            // Divide by window_dimension squared to reduce the floating point accumulation error
+            const float eigenvalue = eigenvalue_A / (window_dimension * window_dimension);
+
+            // Check if it is a good point to track
+            if(untrack_keypoint(eigenvalue < EIGENVALUE_THRESHOLD || determinant < DETERMINANT_THRESHOLD))
+            {
+                continue;
+            }
+
+            float prev_delta_x = 0.f;
+            float prev_delta_y = 0.f;
+
+            // Iterative refinement of the new keypoint position on this level
+            for(size_t j = 0; j < num_iterations; ++j)
+            {
+                // Check if tracked coordinate is outside image coordinate
+                if(untrack_keypoint(is_invalid_keypoint(new_keypoint, valid_region, window_dimension)))
+                {
+                    break;
+                }
+
+                // Compute temporal derivative
+                std::vector<int> bilinear_it_old = compute_derivative(old_pyramid[level], old_keypoint, border_mode, constant_border_value, window_dimension, W_BITS - 5);
+                std::vector<int> bilinear_it_new = compute_derivative(new_pyramid[level], new_keypoint, border_mode, constant_border_value, window_dimension, W_BITS - 5);
+
+                double b1 = 0.f;
+                double b2 = 0.f;
+                std::tie(b1, b2) = compute_temporal_gradient_vector(bilinear_it_old, bilinear_it_new, bilinear_ix, bilinear_iy);
+
+                // Compute motion vector -> A^-1 * -b
+                const float delta_x = (A12 * b2 - A22 * b1) / determinant;
+                const float delta_y = (A12 * b1 - A11 * b2) / determinant;
+
+                // Update the new position
+                new_keypoint.x += delta_x;
+                new_keypoint.y += delta_y;
+
+                const float magnitude_squared = delta_x * delta_x + delta_y * delta_y;
+
+                // Check if termination criteria is EPSILON and if it is satisfied
+                if(magnitude_squared <= params.epsilon && (params.termination == Termination::TERM_CRITERIA_EPSILON || params.termination == Termination::TERM_CRITERIA_BOTH))
+                {
+                    break;
+                }
+
+                // Check convergence analyzing the previous delta
+                // If consecutive deltas cancel out, the estimate is oscillating:
+                // back half a step off and stop iterating
+                if(j > 0 && (std::fabs(delta_x + prev_delta_x) < 0.01f && std::fabs(delta_y + prev_delta_y) < 0.01f))
+                {
+                    new_keypoint.x -= delta_x * SCALE_PYRAMID_HALF;
+                    new_keypoint.y -= delta_y * SCALE_PYRAMID_HALF;
+
+                    break;
+                }
+
+                prev_delta_x = delta_x;
+                prev_delta_y = delta_y;
+            }
+        }
+    }
+
+    // Copy optical flow coordinates to output vector
+    for(size_t i = 0; i < old_points.size(); ++i)
+    {
+        const InternalKeyPoint &new_keypoint = new_points_internal.at(i);
+
+        // Round sub-pixel coordinates to the nearest integer pixel
+        new_points.at(i).x               = roundf(new_keypoint.x);
+        new_points.at(i).y               = roundf(new_keypoint.y);
+        new_points.at(i).tracking_status = new_keypoint.tracking_status ? 1 : 0;
+    }
+
+    return new_points;
+}
+
+template std::vector<KeyPoint> optical_flow(const SimpleTensor<uint8_t> &old_input, const SimpleTensor<uint8_t> &new_input,
+                                            const OpticalFlowParameters &params, size_t num_levels,
+                                            const std::vector<KeyPoint> &old_points, const std::vector<KeyPoint> &new_points_estimates,
+                                            BorderMode border_mode, uint8_t constant_border_value);
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/reference/OpticalFlow.h b/tests/validation/reference/OpticalFlow.h
new file mode 100644
index 0000000..ad6e2a9
--- /dev/null
+++ b/tests/validation/reference/OpticalFlow.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_OPTICAL_FLOW_H__
+#define __ARM_COMPUTE_TEST_OPTICAL_FLOW_H__
+
+#include "tests/SimpleTensor.h"
+#include "tests/Types.h"
+
+#include <vector>
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+template <typename T>
+std::vector<KeyPoint> optical_flow(const SimpleTensor<T> &old_input, const SimpleTensor<T> &new_input,
+                                   const OpticalFlowParameters &params, size_t num_levels,
+                                   const std::vector<KeyPoint> &old_points, const std::vector<KeyPoint> &new_points_estimates,
+                                   BorderMode border_mode, uint8_t constant_border_value);
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_TEST_OPTICAL_FLOW_H__ */
diff --git a/tests/validation/reference/Permute.cpp b/tests/validation/reference/Permute.cpp
index 4a12ca6..bbb2e8d 100644
--- a/tests/validation/reference/Permute.cpp
+++ b/tests/validation/reference/Permute.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -42,7 +42,7 @@
     permute(dst_shape, perm);
 
     // Create reference
-    SimpleTensor<T> dst{ dst_shape, src.data_type() };
+    SimpleTensor<T> dst{ dst_shape, src.data_type(), src.num_channels(), src.fixed_point_position(), src.quantization_info() };
 
     // Compute reference
     for(int i = 0; i < src.num_elements(); ++i)
@@ -57,9 +57,13 @@
     return dst;
 }
 
+template SimpleTensor<int8_t> permute(const SimpleTensor<int8_t> &src, PermutationVector perm);
 template SimpleTensor<uint8_t> permute(const SimpleTensor<uint8_t> &src, PermutationVector perm);
+template SimpleTensor<int16_t> permute(const SimpleTensor<int16_t> &src, PermutationVector perm);
 template SimpleTensor<uint16_t> permute(const SimpleTensor<uint16_t> &src, PermutationVector perm);
 template SimpleTensor<uint32_t> permute(const SimpleTensor<uint32_t> &src, PermutationVector perm);
+template SimpleTensor<float> permute(const SimpleTensor<float> &src, PermutationVector perm);
+template SimpleTensor<half> permute(const SimpleTensor<half> &src, PermutationVector perm);
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/PoolingLayer.cpp b/tests/validation/reference/PoolingLayer.cpp
index c14ab98..6973454 100644
--- a/tests/validation/reference/PoolingLayer.cpp
+++ b/tests/validation/reference/PoolingLayer.cpp
@@ -24,6 +24,7 @@
 #include "PoolingLayer.h"
 
 #include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "tests/validation/FixedPoint.h"
 #include "tests/validation/Helpers.h"
 
@@ -35,30 +36,16 @@
 {
 namespace reference
 {
-namespace
-{
-TensorShape calculate_output_shape(TensorShape shape, const PoolingLayerInfo &info)
-{
-    TensorShape dst_shape   = shape;
-    const int   pool_size_x = info.is_global_pooling() ? shape.x() : info.pool_size().width;
-    const int   pool_size_y = info.is_global_pooling() ? shape.y() : info.pool_size().height;
-    const std::pair<unsigned int, unsigned int> scaled_dims = arm_compute::scaled_dimensions(shape.x(),
-                                                                                             shape.y(),
-                                                                                             pool_size_x,
-                                                                                             pool_size_y,
-                                                                                             info.pad_stride_info());
-    dst_shape.set(0, scaled_dims.first);
-    dst_shape.set(1, scaled_dims.second);
-
-    return dst_shape;
-}
-} // namespace
+using namespace arm_compute::misc::shape_calculator;
 
 template <typename T, typename std::enable_if<is_floating_point<T>::value, int>::type>
 SimpleTensor<T> pooling_layer(const SimpleTensor<T> &src, const PoolingLayerInfo &info)
 {
     ARM_COMPUTE_ERROR_ON(info.is_global_pooling() && (src.shape().x() != src.shape().y()));
 
+    // Create reference
+    SimpleTensor<T> dst{ compute_pool_shape(TensorInfo(src.shape(), 1, src.data_type(), src.fixed_point_position()), info), src.data_type(), 1, src.fixed_point_position() };
+
     const int   pool_size_x     = info.is_global_pooling() ? src.shape().x() : info.pool_size().width;
     const int   pool_size_y     = info.is_global_pooling() ? src.shape().y() : info.pool_size().height;
     PoolingType type            = info.pool_type();
@@ -74,9 +61,6 @@
     const auto h_src      = static_cast<int>(src.shape()[1]);
     const int  upper_dims = src.shape().total_size() / (w_src * h_src);
 
-    // Create reference
-    SimpleTensor<T> dst{ calculate_output_shape(src.shape(), info), src.data_type(), 1, src.fixed_point_position() };
-
     const auto w_dst = static_cast<int>(dst.shape()[0]);
     const auto h_dst = static_cast<int>(dst.shape()[1]);
 
@@ -173,6 +157,10 @@
 {
     ARM_COMPUTE_ERROR_ON(info.is_global_pooling() && (src.shape().x() != src.shape().y()));
 
+    const auto w_src      = static_cast<int>(src.shape()[0]);
+    const auto h_src      = static_cast<int>(src.shape()[1]);
+    const int  upper_dims = src.shape().total_size() / (w_src * h_src);
+
     const int   pool_size_x     = info.is_global_pooling() ? src.shape().x() : info.pool_size().width;
     const int   pool_size_y     = info.is_global_pooling() ? src.shape().y() : info.pool_size().height;
     PoolingType type            = info.pool_type();
@@ -184,12 +172,8 @@
     int         pad_bottom      = info.pad_stride_info().pad_bottom();
     bool        exclude_padding = info.exclude_padding();
 
-    const auto w_src      = static_cast<int>(src.shape()[0]);
-    const auto h_src      = static_cast<int>(src.shape()[1]);
-    const int  upper_dims = src.shape().total_size() / (w_src * h_src);
-
     // Create reference
-    SimpleTensor<T> dst{ calculate_output_shape(src.shape(), info), src.data_type(), 1, src.fixed_point_position() };
+    SimpleTensor<T> dst{ compute_pool_shape(TensorInfo(src.shape(), 1, src.data_type(), src.fixed_point_position()), info), src.data_type(), 1, src.fixed_point_position() };
 
     const auto w_dst = static_cast<int>(dst.shape()[0]);
     const auto h_dst = static_cast<int>(dst.shape()[1]);
diff --git a/tests/validation/reference/Scale.cpp b/tests/validation/reference/Scale.cpp
index 0cc96ab..f8a8b88 100644
--- a/tests/validation/reference/Scale.cpp
+++ b/tests/validation/reference/Scale.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,8 +23,9 @@
  */
 
 #include "Scale.h"
+
 #include "Utils.h"
-#include "arm_compute/core/utils/misc/utility.h"
+#include "arm_compute/core/utils/misc/Utility.h"
 #include "support/ToolchainSupport.h"
 
 namespace arm_compute
diff --git a/tests/validation/reference/Utils.h b/tests/validation/reference/Utils.h
index 2aa77c6..0e98bbe 100644
--- a/tests/validation/reference/Utils.h
+++ b/tests/validation/reference/Utils.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -62,11 +62,13 @@
 {
     const int x      = coord.x();
     const int y      = coord.y();
+    const int z      = coord.z();
     const int width  = src.shape().x();
     const int height = src.shape().y();
+    const int depth  = src.shape().z();
 
     // If coordinates beyond range of tensor's width or height
-    if(x < 0 || y < 0 || x >= width || y >= height)
+    if(x < 0 || y < 0 || z < 0 || x >= width || y >= height || z >= depth)
     {
         if(border_mode == BorderMode::REPLICATE)
         {
diff --git a/tests/validation/reference/WidthConcatenateLayer.cpp b/tests/validation/reference/WidthConcatenateLayer.cpp
new file mode 100644
index 0000000..fe79b4a
--- /dev/null
+++ b/tests/validation/reference/WidthConcatenateLayer.cpp
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "WidthConcatenateLayer.h"
+
+#include "tests/validation/FixedPoint.h"
+#include "tests/validation/Helpers.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+// Concatenates the input tensors along the width (x) dimension.
+// Inputs are laid out side by side in the destination, whose width is
+// computed by calculate_width_concatenate_shape from the input shapes.
+template <typename T>
+SimpleTensor<T> widthconcatenate_layer(const std::vector<SimpleTensor<T>> &srcs)
+{
+    // Create reference
+    std::vector<TensorShape> shapes;
+
+    for(const auto &src : srcs)
+    {
+        shapes.emplace_back(src.shape());
+    }
+
+    DataType        dst_type  = srcs.empty() ? DataType::UNKNOWN : srcs[0].data_type();
+    TensorShape     dst_shape = calculate_width_concatenate_shape(shapes);
+    SimpleTensor<T> dst(dst_shape, dst_type);
+
+    // Compute reference
+    // width_offset tracks where the next source tensor starts along x
+    int       width_offset = 0;
+    const int width_out    = dst.shape().x();
+
+    // Set output tensor to 0
+    std::fill_n(dst.data(), dst.num_elements(), 0);
+
+    for(const auto &src : srcs)
+    {
+        ARM_COMPUTE_ERROR_ON(width_offset >= width_out);
+
+        const int width  = src.shape().x();
+        const int height = src.shape().y();
+        const int depth  = src.shape().z();
+
+        const T *src_ptr = src.data();
+        T       *dst_ptr = dst.data();
+
+        for(int d = 0; d < depth; ++d)
+        {
+            for(int r = 0; r < height; ++r)
+            {
+                // Copy one source row into the destination row at the current x offset
+                int offset = d * height + r;
+                std::copy(src_ptr, src_ptr + width, dst_ptr + width_offset + offset * width_out);
+                src_ptr += width;
+            }
+        }
+
+        // Next tensor is placed immediately to the right of this one
+        width_offset += width;
+    }
+
+    return dst;
+}
+
+template SimpleTensor<float> widthconcatenate_layer(const std::vector<SimpleTensor<float>> &srcs);
+template SimpleTensor<half> widthconcatenate_layer(const std::vector<SimpleTensor<half>> &srcs);
+template SimpleTensor<qint8_t> widthconcatenate_layer(const std::vector<SimpleTensor<qint8_t>> &srcs);
+template SimpleTensor<qint16_t> widthconcatenate_layer(const std::vector<SimpleTensor<qint16_t>> &srcs);
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/reference/WidthConcatenateLayer.h b/tests/validation/reference/WidthConcatenateLayer.h
new file mode 100644
index 0000000..237e72b
--- /dev/null
+++ b/tests/validation/reference/WidthConcatenateLayer.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_WIDTHCONCATENATE_LAYER_H__
+#define __ARM_COMPUTE_TEST_WIDTHCONCATENATE_LAYER_H__
+
+#include "tests/SimpleTensor.h"
+
+#include <vector>
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+template <typename T>
+SimpleTensor<T> widthconcatenate_layer(const std::vector<SimpleTensor<T>> &srcs);
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_TEST_WIDTHCONCATENATE_LAYER_H__ */
diff --git a/tests/validation/reference/Winograd.cpp b/tests/validation/reference/Winograd.cpp
new file mode 100644
index 0000000..194a78e
--- /dev/null
+++ b/tests/validation/reference/Winograd.cpp
@@ -0,0 +1,466 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "Winograd.h"
+
+#include "tests/validation/Helpers.h"
+#include "tests/validation/reference/Utils.h"
+
+#include "arm_compute/core/Types.h"
+
+#include <algorithm>
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+namespace
+{
+/** Fill @p src with the Winograd transformation matrix for the requested configuration.
+ *
+ * The matrices are the standard Winograd B/G/A matrices for the supported
+ * (output tile, kernel) combinations: 2x2/3x3, 4x4/3x3 and 4x4/5x5.
+ *
+ * @param[out] src                     Tensor receiving the matrix values; its number of
+ *                                     elements must match the selected matrix size.
+ * @param[in]  output_tile_size        Output tile size of the Winograd configuration.
+ * @param[in]  kernel_size             Kernel size of the Winograd configuration.
+ * @param[in]  winograd_transform_type Which transform matrix to select (INPUT/FILTER/OUTPUT).
+ *
+ * Raises ARM_COMPUTE_ERROR if the configuration is not in the lookup table.
+ */
+template <typename T>
+void initialize_matrix_transform(SimpleTensor<T> &src, const Size2D &output_tile_size, const Size2D &kernel_size, WinogradTransformType winograd_transform_type)
+{
+    // Winograd input transform matrices
+    static const float imatrix2x2_3x3[] =
+    {
+        1.0f, 0.0f, -1.0f, 0.0f,
+        0.0f, 1.0f, 1.0f, 0.0f,
+        0.0f, -1.0f, 1.0f, 0.0f,
+        0.0f, 1.0f, 0.0f, -1.0f
+    };
+
+    static const float imatrix4x4_3x3[] =
+    {
+        4.0f, 0.0f, -5.0f, 0.0f, 1.0f, 0.0f,
+        0.0f, -4.0f, -4.0f, 1.0f, 1.0f, 0.0f,
+        0.0f, 4.0f, -4.0f, -1.0f, 1.0f, 0.0f,
+        0.0f, -2.0f, -1.0f, 2.0f, 1.0f, 0.0f,
+        0.0f, 2.0f, -1.0f, -2.0f, 1.0f, 0.0f,
+        0.0f, 4.0f, 0.0f, -5.0f, 0.0f, 1.0f,
+    };
+
+    static const float imatrix4x4_5x5[] =
+    {
+        1.f, 0.f, -21.f / 4.f, 0.f, 21.f / 4.f, 0.f, -1.f, 0.f,
+        0.f, 1.f, 1.f, -17.f / 4.f, -17.f / 4.f, 1.f, 1.f, 0.f,
+        0.f, -1.f, 1.f, 17.f / 4.f, -17.f / 4.f, -1.f, 1.f, 0.f,
+        0.f, 1.f / 2.f, 1.f / 4.f, -5.f / 2.f, -5.f / 4.f, 2.f, 1.f, 0.f,
+        0.f, -1.f / 2.f, 1.f / 4.f, 5.f / 2.f, -5.f / 4.f, -2.f, 1.f, 0.f,
+        0.f, 2.f, 4.f, -5.f / 2.f, -5.f, 1.f / 2.f, 1.f, 0.f,
+        0.f, -2.f, 4.f, 5.f / 2.f, -5.f, -1.f / 2.f, 1.f, 0.f,
+        0.f, -1.f, 0.f, 21.f / 4.f, 0.f, -21.f / 4.f, 0.f, 1.f
+    };
+
+    // ------------------------------------------
+
+    // Winograd filter transform matrices
+    static const float fmatrix2x2_3x3[] =
+    {
+        1.0f, 0.0f, 0.0f,
+        0.5f, 0.5f, 0.5f,
+        0.5f, -0.5f, 0.5f,
+        0.0f, 0.0f, 1.0f
+    };
+
+    static const float fmatrix4x4_3x3[] =
+    {
+        0.25f, 0.0f, 0.0f,
+        -1.0f / 6.0f, -1.0f / 6.0f, -1.0f / 6.0f,
+        -1.0f / 6.0f, 1.0f / 6.0f, -1.0f / 6.0f,
+        1.0f / 24.0f, 1.0f / 12.0f, 1.0f / 6.0f,
+        1.0f / 24.0f, -1.0f / 12.0f, 1.0f / 6.0f,
+        0.0f, 0.0f, 1.0f
+    };
+
+    static const float fmatrix4x4_5x5[] =
+    {
+        1.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+        -2.0f / 9.0f, -2.0f / 9.0f, -2.0f / 9.0f, -2.0f / 9.0f, -2.0f / 9.0f,
+        -2.0f / 9.0f, 2.0f / 9.0f, -2.0f / 9.0f, 2.0f / 9.0f, -2.0f / 9.0f,
+        1.0f / 90.0f, 1.0f / 45.0f, 2.0f / 45.0f, 4.0f / 45.0f, 8.0f / 45.0f,
+        1.0f / 90.0f, -1.0f / 45.0f, 2.0f / 45.0f, -4.0f / 45.0f, 8.0f / 45.0f,
+        4.0f / 45.0f, 2.0f / 45.0f, 1.0f / 45.0f, 1.0f / 90.0f, 1.0f / 180.0f,
+        4.0f / 45.0f, -2.0f / 45.0f, 1.0f / 45.0f, -1.0f / 90.0f, 1.0f / 180.0f,
+        0.0f, 0.0f, 0.0f, 0.0f, 1.0f
+
+    };
+
+    // ------------------------------------------
+
+    // Winograd output transform matrices
+    static const float omatrix2x2_3x3[] =
+    {
+        1.0f, 1.0f, 1.0f, 0.0f,
+        0.0f, 1.0f, -1.0f, -1.0f
+    };
+
+    static const float omatrix4x4_3x3[] =
+    {
+        1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.0f,
+        0.0f, 1.0f, -1.0f, 2.0f, -2.0f, 0.0f,
+        0.0f, 1.0f, 1.0f, 4.0f, 4.0f, 0.0f,
+        0.0f, 1.0f, -1.0f, 8.0f, -8.0f, 1.0f
+    };
+
+    static const float omatrix4x4_5x5[] =
+    {
+        1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 8.0f, 8.0f, 0.0f,
+        0.0f, 1.0f, -1.0f, 2.0f, -2.0f, 4.0f, -4.0f, 0.0f,
+        0.0f, 1.0f, 1.0f, 4.0f, 4.0f, 2.0f, 2.0f, 0.0f,
+        0.0f, 1.0f, -1.0f, 8.0f, -8.0f, 1.0f, -1.0f, 1.0f
+    };
+
+    // ------------------------------------------
+
+    using WinogradKey = std::tuple<std::pair<int, int>, std::pair<int, int>, WinogradTransformType>;
+
+    // Key = (Output tile size, Kernel size, Winograd transform type)
+    static std::map<WinogradKey, const float *> matrix_map =
+    {
+        { WinogradKey(std::pair<int, int>(2, 2), std::pair<int, int>(3, 3), WinogradTransformType::INPUT), imatrix2x2_3x3 },
+        { WinogradKey(std::pair<int, int>(4, 4), std::pair<int, int>(3, 3), WinogradTransformType::INPUT), imatrix4x4_3x3 },
+        { WinogradKey(std::pair<int, int>(4, 4), std::pair<int, int>(5, 5), WinogradTransformType::INPUT), imatrix4x4_5x5 },
+        { WinogradKey(std::pair<int, int>(2, 2), std::pair<int, int>(3, 3), WinogradTransformType::FILTER), fmatrix2x2_3x3 },
+        { WinogradKey(std::pair<int, int>(4, 4), std::pair<int, int>(3, 3), WinogradTransformType::FILTER), fmatrix4x4_3x3 },
+        { WinogradKey(std::pair<int, int>(4, 4), std::pair<int, int>(5, 5), WinogradTransformType::FILTER), fmatrix4x4_5x5 },
+        { WinogradKey(std::pair<int, int>(2, 2), std::pair<int, int>(3, 3), WinogradTransformType::OUTPUT), omatrix2x2_3x3 },
+        { WinogradKey(std::pair<int, int>(4, 4), std::pair<int, int>(3, 3), WinogradTransformType::OUTPUT), omatrix4x4_3x3 },
+        { WinogradKey(std::pair<int, int>(4, 4), std::pair<int, int>(5, 5), WinogradTransformType::OUTPUT), omatrix4x4_5x5 },
+    };
+
+    // Find transformation matrix
+    std::map<WinogradKey, const float *>::iterator it;
+
+    it = matrix_map.find(WinogradKey(std::pair<int, int>(output_tile_size.width, output_tile_size.height),
+                                     std::pair<int, int>(kernel_size.width, kernel_size.height),
+                                     winograd_transform_type));
+
+    float const *matrix_values = nullptr;
+    if(it != matrix_map.end())
+    {
+        // Get matrix pointer
+        matrix_values = it->second;
+    }
+    else
+    {
+        ARM_COMPUTE_ERROR("Winograd configuration not supported");
+    }
+
+    // Copy values
+    // NOTE(review): copies exactly src.num_elements() floats; src's shape must
+    // match the selected matrix dimensions (callers size it accordingly).
+    std::copy(&matrix_values[0], &matrix_values[0] + src.num_elements(), &src[0]);
+}
+} // namespace
+
+/** Reference Winograd input transform (NCHW only).
+ *
+ * Extracts overlapping input tiles of size (output_tile + kernel - 1)^2,
+ * applies dst = M * tile * M^T with the INPUT transform matrix, and scatters
+ * the transformed tile elements across the output:
+ *   out(x = input channel, y = tile index, z = element within transformed tile, batch).
+ *
+ * @param[in] in            Input tensor in NCHW layout.
+ * @param[in] output_shape  Shape of the transformed output tensor.
+ * @param[in] winograd_info Winograd configuration (conv info, tile size, kernel size).
+ *
+ * @return The transformed tensor.
+ */
+template <typename T>
+SimpleTensor<T> winograd_input_transform(const SimpleTensor<T> &in, const TensorShape &output_shape, const WinogradInfo &winograd_info)
+{
+    ARM_COMPUTE_ERROR_ON(in.data_layout() != DataLayout::NCHW);
+
+    const PadStrideInfo conv_info        = winograd_info.convolution_info;
+    const Size2D        output_tile_size = winograd_info.output_tile_size;
+    const Size2D        kernel_size      = winograd_info.kernel_size;
+
+    SimpleTensor<T> out{ output_shape, in.data_type() };
+
+    // Calculate dimensions for the tile: input tile = output tile + kernel - 1
+    const unsigned int tile_w = output_tile_size.width + kernel_size.width - 1;
+    const unsigned int tile_h = output_tile_size.height + kernel_size.height - 1;
+
+    TensorShape tile_dims(tile_w, tile_h);
+
+    // Simple tensor for the input tile
+    SimpleTensor<T> src_tile{ tile_dims, in.data_type() };
+
+    // Simple tensor for the temporary tile
+    SimpleTensor<T> tmp_tile{ tile_dims, in.data_type() };
+
+    // Simple tensor for the output tile
+    SimpleTensor<T> dst_tile{ tile_dims, in.data_type() };
+
+    // Simple tensor for the transformation matrix
+    SimpleTensor<T> matrix{ tile_dims, in.data_type() };
+
+    // Simple tensor for the transformation matrix transposed
+    SimpleTensor<T> matrix_transposed{ tile_dims, in.data_type() };
+
+    // Initialize matrix for the input transform
+    initialize_matrix_transform(matrix, output_tile_size, kernel_size, WinogradTransformType::INPUT);
+
+    // Transpose matrix
+    transpose_matrix(matrix, matrix_transposed);
+
+    const int in_w        = in.shape().x();
+    const int in_h        = in.shape().y();
+    const int in_d        = in.shape().z();
+    const int out_d       = out.shape().z();
+    const int num_batches = in.shape().total_size() / (in_w * in_h * in_d);
+    // Number of tiles covering the (padded) input in each direction
+    const int num_tiles_x = std::ceil((in_w - (kernel_size.width - 1) + conv_info.pad_left() + conv_info.pad_right()) / static_cast<float>(output_tile_size.width));
+    const int num_tiles_y = std::ceil((in_h - (kernel_size.height - 1) + conv_info.pad_top() + conv_info.pad_bottom()) / static_cast<float>(output_tile_size.height));
+    // Tiles advance by the output tile size (tiles overlap by kernel - 1)
+    const int step_x      = output_tile_size.width;
+    const int step_y      = output_tile_size.height;
+
+    ARM_COMPUTE_ERROR_ON((num_tiles_x * num_tiles_y) != static_cast<int>(out.shape().y()));
+
+    for(int b = 0; b < num_batches; ++b)
+    {
+        for(int z = 0; z < in_d; ++z)
+        {
+            for(int y = 0; y < num_tiles_y; ++y)
+            {
+                for(int x = 0; x < num_tiles_x; ++x)
+                {
+                    // Top-left corner of the tile; may be negative due to padding
+                    int xi = x * step_x - conv_info.pad_left();
+                    int yi = y * step_y - conv_info.pad_top();
+
+                    // Get the tile from the input tensor
+                    get_tile(in, src_tile, Coordinates(xi, yi, z, b));
+
+                    // Compute the transformation: dst = M * src * M^T
+                    matrix_multiply(matrix, src_tile, tmp_tile);
+                    matrix_multiply(tmp_tile, matrix_transposed, dst_tile);
+
+                    // Store the output tile across the channels
+                    for(int i = 0; i < out_d; ++i)
+                    {
+                        int xo = z;
+                        int yo = x + y * num_tiles_x;
+                        out[coords2index(out.shape(), Coordinates(xo, yo, i, b))] = dst_tile[i];
+                    }
+                }
+            }
+        }
+    }
+
+    return out;
+}
+
+/** Reference Winograd filter transform (NCHW only).
+ *
+ * Transforms each KxK kernel with dst = G * kernel * G^T (FILTER matrix) and
+ * stores the transformed values interleaved so that element i of each kernel
+ * is laid out at offset (w + z * num_filters) + i * num_filters * num_channels.
+ *
+ * @param[in] in            Weights tensor in NCHW layout: (K, K, channels, filters).
+ * @param[in] output_shape  Shape of the transformed weights tensor.
+ * @param[in] winograd_info Winograd configuration (tile size, kernel size).
+ *
+ * @return The transformed weights tensor.
+ */
+template <typename T>
+SimpleTensor<T> winograd_filter_transform(const SimpleTensor<T> &in, const TensorShape &output_shape, const WinogradInfo &winograd_info)
+{
+    ARM_COMPUTE_ERROR_ON_MSG(in.data_layout() != DataLayout::NCHW, "Only supported NCHW data format");
+
+    // Create reference
+    SimpleTensor<T> out{ output_shape, in.data_type(), 1 };
+
+    const Size2D output_tile_size = winograd_info.output_tile_size;
+    const Size2D kernel_size      = winograd_info.kernel_size;
+
+    TensorShape kernel_tile_dims(kernel_size.width, kernel_size.height);
+
+    // Calculate dimensions for the tile: input tile = output tile + kernel - 1
+    const unsigned int input_tile_w    = output_tile_size.width + kernel_size.width - 1;
+    const unsigned int input_tile_h    = output_tile_size.height + kernel_size.height - 1;
+    const unsigned int input_tile_area = input_tile_w * input_tile_h;
+
+    // Simple tensor for the input tile
+    SimpleTensor<T> input_tile{ kernel_tile_dims, in.data_type(), 1 };
+
+    // Simple tensor for the transformation matrix (G: input_tile_w x kernel_w)
+    SimpleTensor<T> trans_matrix{ TensorShape(kernel_tile_dims[0], input_tile_w), in.data_type(), 1 };
+
+    // Simple tensor for the transformation matrix transpose
+    SimpleTensor<T> trans_matrix_transposed{ TensorShape(input_tile_w, kernel_tile_dims[0]), in.data_type(), 1 };
+
+    // Simple tensor for the temporary tile
+    SimpleTensor<T> tmp_tile{ TensorShape(kernel_tile_dims[0], input_tile_w), in.data_type(), 1 };
+
+    // Simple tensor for the output tile
+    SimpleTensor<T> transf_tile{ TensorShape(input_tile_w, input_tile_w), in.data_type(), 1 };
+
+    // Initialize matrix for the filter transform
+    initialize_matrix_transform(trans_matrix, output_tile_size, kernel_size, WinogradTransformType::FILTER);
+
+    // Transpose the transformation matrix
+    transpose_matrix(trans_matrix, trans_matrix_transposed);
+
+    const int num_channels = in.shape()[2];
+    const int num_filters  = in.shape()[3];
+    const int num_batches  = in.shape().total_size() / (kernel_size.area() * num_channels * num_filters);
+
+    for(int n = 0; n < num_batches; ++n)
+    {
+        for(int w = 0; w < num_filters; ++w)
+        {
+            for(int z = 0; z < num_channels; ++z)
+            {
+                // Load the tile from the input tensor
+                get_tile(in, input_tile, Coordinates(0, 0, z, w, n));
+
+                // First transformation: G * kernel
+                matrix_multiply(trans_matrix, input_tile, tmp_tile);
+
+                // Second transformation: (G * kernel) * G^T
+                matrix_multiply(tmp_tile, trans_matrix_transposed, transf_tile);
+
+                // Store the output tile across the channels
+                const int output_offset = w + z * num_filters;
+
+                // Store the values across the channels
+                for(unsigned int i = 0; i < input_tile_area; ++i)
+                {
+                    out[output_offset + i * num_filters * num_channels] = transf_tile[i];
+                }
+            }
+        }
+    }
+
+    return out;
+}
+
+/** Reference Winograd output transform (NCHW output only).
+ *
+ * Gathers each transformed tile from across the channels of @p in
+ * (x = output channel, y = tile index, z = element within tile, per the
+ * asserts below), applies dst = A * tile * A^T with the OUTPUT matrix,
+ * writes the resulting output-tile into its spatial position and adds the
+ * per-channel bias @p b.
+ *
+ * @param[in] in            Transformed tensor: (out channels, num tiles, tile area, batches).
+ * @param[in] b             Bias tensor, indexed by output channel.
+ * @param[in] output_shape  Shape of the final output tensor (NCHW).
+ * @param[in] winograd_info Winograd configuration (conv info, input dims, tile size, kernel size).
+ *
+ * @return The output tensor of the convolution.
+ */
+template <typename T>
+SimpleTensor<T> winograd_output_transform(const SimpleTensor<T> &in, const SimpleTensor<T> &b, const TensorShape &output_shape, const WinogradInfo &winograd_info)
+{
+    ARM_COMPUTE_ERROR_ON_MSG(winograd_info.output_data_layout != DataLayout::NCHW, "Only supported NCHW data format");
+
+    const PadStrideInfo conv_info        = winograd_info.convolution_info;
+    const Size2D        input_dimensions = winograd_info.input_dimensions;
+    const Size2D        output_tile_size = winograd_info.output_tile_size;
+    const Size2D        kernel_size      = winograd_info.kernel_size;
+
+    // Create reference
+    SimpleTensor<T> out{ output_shape, in.data_type(), 1 };
+
+    // Calculate dimensions for the tiles
+    const unsigned int in_tile_w  = output_tile_size.width + kernel_size.width - 1;
+    const unsigned int in_tile_h  = output_tile_size.height + kernel_size.height - 1;
+    const unsigned int out_tile_w = output_tile_size.width;
+    const unsigned int out_tile_h = output_tile_size.height;
+
+    ARM_COMPUTE_ERROR_ON(in.shape()[2] != (in_tile_w * in_tile_h));
+    ARM_COMPUTE_ERROR_ON(in.shape()[0] != out.shape()[2]);
+
+    // Compute tile dimensions
+    // Input tile dimensions
+    TensorShape in_tile_dims(in_tile_w, in_tile_h);
+
+    // Output tile dimensions
+    TensorShape out_tile_dims(output_tile_size.width, output_tile_size.height);
+
+    // Transformation matrix dimensions (A: out_tile_w x in_tile_w)
+    TensorShape tr_tile_dims(in_tile_w, output_tile_size.width);
+
+    // Create tensors
+    // Simple tensor for the input tile
+    SimpleTensor<T> input_tile{ in_tile_dims, in.data_type(), 1 };
+
+    // Simple tensor for the transformation matrix
+    SimpleTensor<T> trans_matrix{ tr_tile_dims, in.data_type(), 1 };
+
+    // Simple tensor for the transformation matrix transpose
+    SimpleTensor<T> trans_matrix_transposed{ TensorShape(tr_tile_dims[1], tr_tile_dims[0]), in.data_type(), 1 };
+
+    // Simple tensor for the temporary tile
+    SimpleTensor<T> tmp_tile{ tr_tile_dims, in.data_type(), 1 };
+
+    // Simple tensor for the output tile
+    SimpleTensor<T> output_tile{ out_tile_dims, in.data_type(), 1 };
+
+    // Initialize matrix for the output transform
+    initialize_matrix_transform(trans_matrix, output_tile_size, kernel_size, WinogradTransformType::OUTPUT);
+
+    // Transpose the transformation matrix
+    transpose_matrix(trans_matrix, trans_matrix_transposed);
+
+    const int w_in        = in.shape()[0];
+    const int h_in        = in.shape()[1];
+    const int c_in        = in.shape()[2];
+    const int w_out       = out.shape()[0];
+    const int h_out       = out.shape()[1];
+    const int c_out       = out.shape()[2];
+    const int num_batches = in.shape().total_size() / (w_in * h_in * c_in);
+
+    // Input strides
+    const int stridey_in = w_in;
+    const int stridez_in = stridey_in * h_in;
+    const int stridew_in = stridez_in * c_in;
+
+    // Output strides
+    const int stridey_out = w_out;
+    const int stridez_out = stridey_out * h_out;
+    const int stridew_out = stridez_out * c_out;
+
+    // Compute number of elements to process in the X and Y direction
+    const int num_elements_x = input_dimensions.width - (kernel_size.width - 1) + conv_info.pad_left() + conv_info.pad_right();
+    const int num_elements_y = input_dimensions.height - (kernel_size.height - 1) + conv_info.pad_top() + conv_info.pad_bottom();
+    const int num_tiles_x    = std::ceil(num_elements_x / static_cast<float>(output_tile_size.width));
+    const int num_tiles_y    = std::ceil(num_elements_y / static_cast<float>(output_tile_size.height));
+
+    // num_tiles_y only feeds the assert; silence unused warning in release builds
+    ARM_COMPUTE_UNUSED(num_tiles_y);
+    ARM_COMPUTE_ERROR_ON(in.shape()[1] != static_cast<unsigned int>(num_tiles_x * num_tiles_y));
+
+    for(int n = 0; n < num_batches; ++n)
+    {
+        for(int y = 0; y < h_in; ++y)
+        {
+            for(int x = 0; x < w_in; ++x)
+            {
+                // Load the input tile across the channels of the input tensor
+                for(int z = 0; z < c_in; ++z)
+                {
+                    input_tile[z] = in[x + (y * stridey_in) + (z * stridez_in) + (n * stridew_in)];
+                }
+
+                // First transformation: A * tile
+                matrix_multiply(trans_matrix, input_tile, tmp_tile);
+
+                // Second transformation: (A * tile) * A^T
+                matrix_multiply(tmp_tile, trans_matrix_transposed, output_tile);
+
+                // Store the output tile: y encodes the tile index, x the output channel
+                const int xo = (y % num_tiles_x) * out_tile_w;
+                const int yo = (y / num_tiles_x) * out_tile_h;
+                const int zo = x;
+
+                const int output_offset = xo + (yo * stridey_out) + (zo * stridez_out) + (n * stridew_out);
+
+                for(int yi = 0; yi < static_cast<int>(out_tile_h); ++yi)
+                {
+                    for(int xi = 0; xi < static_cast<int>(out_tile_w); ++xi)
+                    {
+                        // Check out-of-bound writes (last tiles may exceed the output extent)
+                        if((xo + xi < w_out) && (yo + yi < h_out))
+                        {
+                            out[output_offset + yi * stridey_out + xi] = output_tile[xi + yi * out_tile_w];
+
+                            // Add bias
+                            out[output_offset + yi * stridey_out + xi] += b[zo];
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    return out;
+}
+
+// Explicit template instantiations for the FP32 reference implementations
+template SimpleTensor<float> winograd_filter_transform(const SimpleTensor<float> &in, const TensorShape &output_shape, const WinogradInfo &winograd_info);
+template SimpleTensor<float> winograd_input_transform(const SimpleTensor<float> &in, const TensorShape &output_shape, const WinogradInfo &winograd_info);
+template SimpleTensor<float> winograd_output_transform(const SimpleTensor<float> &in, const SimpleTensor<float> &b, const TensorShape &output_shape, const WinogradInfo &winograd_info);
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/reference/Winograd.h b/tests/validation/reference/Winograd.h
new file mode 100644
index 0000000..b74c2c3
--- /dev/null
+++ b/tests/validation/reference/Winograd.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_WINOGRAD_H__
+#define __ARM_COMPUTE_TEST_WINOGRAD_H__
+
+#include "arm_compute/core/TensorShape.h"
+
+#include "tests/SimpleTensor.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+/** Winograd transform type */
+enum class WinogradTransformType
+{
+    INPUT,  /**< Winograd input transform */
+    FILTER, /**< Winograd filter transform */
+    OUTPUT  /**< Winograd output transform */
+};
+
+/** Reference Winograd input transform.
+ *
+ * @param[in] in            Input tensor.
+ * @param[in] output_shape  Shape of the transformed output tensor.
+ * @param[in] winograd_info Winograd configuration.
+ *
+ * @return The transformed tensor.
+ */
+template <typename T>
+SimpleTensor<T> winograd_input_transform(const SimpleTensor<T> &in, const TensorShape &output_shape, const WinogradInfo &winograd_info);
+
+/** Reference Winograd filter transform.
+ *
+ * @param[in] in            Weights tensor.
+ * @param[in] output_shape  Shape of the transformed weights tensor.
+ * @param[in] winograd_info Winograd configuration.
+ *
+ * @return The transformed weights tensor.
+ */
+template <typename T>
+SimpleTensor<T> winograd_filter_transform(const SimpleTensor<T> &in, const TensorShape &output_shape, const WinogradInfo &winograd_info);
+
+/** Reference Winograd output transform.
+ *
+ * @param[in] in            Transformed input tensor.
+ * @param[in] b             Bias tensor.
+ * @param[in] output_shape  Shape of the final output tensor.
+ * @param[in] winograd_info Winograd configuration.
+ *
+ * @return The output tensor of the convolution.
+ */
+template <typename T>
+SimpleTensor<T> winograd_output_transform(const SimpleTensor<T> &in, const SimpleTensor<T> &b, const TensorShape &output_shape, const WinogradInfo &winograd_info);
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_TEST_WINOGRAD_H__ */