arm_compute v18.05
diff --git a/tests/validation/reference/BatchNormalizationLayer.cpp b/tests/validation/reference/BatchNormalizationLayer.cpp
index a9d9f03..c8badac 100644
--- a/tests/validation/reference/BatchNormalizationLayer.cpp
+++ b/tests/validation/reference/BatchNormalizationLayer.cpp
@@ -106,7 +106,6 @@
                     const float numerator   = src[pos] - mean[i];
                     const float x_bar       = numerator / denominator;
                     result[pos]             = beta[i] + x_bar * gamma[i];
-                    ;
                 }
             }
         }
diff --git a/tests/validation/reference/ChannelCombine.cpp b/tests/validation/reference/ChannelCombine.cpp
new file mode 100644
index 0000000..c1ec3ec
--- /dev/null
+++ b/tests/validation/reference/ChannelCombine.cpp
@@ -0,0 +1,201 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "ChannelCombine.h"
+
+#include "arm_compute/core/Types.h"
+#include "tests/validation/FixedPoint.h"
+#include "tests/validation/Helpers.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+namespace
+{
+template <typename T> // Allocates the (empty) destination plane(s) that make up an image of the given format
+inline std::vector<SimpleTensor<T>> create_image_planes(const TensorShape &shape, Format format)
+{
+    TensorShape image_shape = adjust_odd_shape(shape, format); // NOTE(review): presumably rounds odd dimensions to what the sub-sampled formats need - confirm in Helpers
+
+    std::vector<SimpleTensor<T>> image_planes;
+
+    switch(format)
+    {
+        case Format::RGB888:
+        case Format::RGBA8888:
+        case Format::YUYV422:
+        case Format::UYVY422:
+        {
+            image_planes.emplace_back(image_shape, format); // Single plane created directly with the requested format
+            break;
+        }
+        case Format::NV12:
+        case Format::NV21:
+        {
+            TensorShape shape_uv88 = calculate_subsampled_shape(image_shape, Format::UV88);
+
+            image_planes.emplace_back(image_shape, Format::U8);  // Plane 0: full-size U8 plane
+            image_planes.emplace_back(shape_uv88, Format::UV88); // Plane 1: UV88 plane with the sub-sampled shape
+            break;
+        }
+        case Format::IYUV:
+        {
+            TensorShape shape_sub2 = calculate_subsampled_shape(image_shape, Format::IYUV);
+
+            image_planes.emplace_back(image_shape, Format::U8); // Plane 0: full-size U8 plane
+            image_planes.emplace_back(shape_sub2, Format::U8);  // Planes 1 and 2: U8 planes with the sub-sampled shape
+            image_planes.emplace_back(shape_sub2, Format::U8);
+            break;
+        }
+        case Format::YUV444:
+        {
+            image_planes.emplace_back(image_shape, Format::U8); // Three U8 planes, all with the full image shape
+            image_planes.emplace_back(image_shape, Format::U8);
+            image_planes.emplace_back(image_shape, Format::U8);
+            break;
+        }
+        default:
+            ARM_COMPUTE_ERROR("Not supported");
+            break;
+    }
+
+    return image_planes;
+}
+} // namespace
+
+template <typename T> // Reference channel combine: packs the separate source planes in image_planes into the plane layout of format
+std::vector<SimpleTensor<T>> channel_combine(const TensorShape &shape, const std::vector<SimpleTensor<T>> &image_planes, Format format)
+{
+    std::vector<SimpleTensor<T>> dst = create_image_planes<T>(shape, format); // Destination planes; their number and shapes depend on format
+
+    for(unsigned int plane_idx = 0; plane_idx < dst.size(); ++plane_idx)
+    {
+        SimpleTensor<T> &dst_tensor = dst[plane_idx];
+
+        for(int element_idx = 0; element_idx < dst_tensor.num_elements(); ++element_idx) // NOTE: the YUYV422/UYVY422 case below also advances element_idx
+        {
+            Coordinates coord = index2coord(dst_tensor.shape(), element_idx);
+
+            switch(format)
+            {
+                case Format::RGB888:
+                case Format::RGBA8888:
+                {
+                    // Interleave: channel c of the destination element is read from source plane c at the same coordinates
+                    for(int channel_idx = 0; channel_idx < dst_tensor.num_channels(); ++channel_idx)
+                    {
+                        const T &src_value = reinterpret_cast<const T *>(image_planes[channel_idx](coord))[0];
+                        T       &dst_value = reinterpret_cast<T *>(dst_tensor(coord))[channel_idx];
+
+                        dst_value = src_value;
+                    }
+                    break;
+                }
+                case Format::YUYV422:
+                case Format::UYVY422:
+                {
+                    // Source planes 1 and 2 are read at half the horizontal coordinate: one sample each per pair of destination elements
+                    const Coordinates coord_hori(coord.x() / 2, coord.y());
+
+                    const T &src0 = reinterpret_cast<const T *>(image_planes[0](coord))[0];
+                    const T &src1 = reinterpret_cast<const T *>(image_planes[1](coord_hori))[0];
+
+                    const int shift = (Format::YUYV422 == format) ? 1 : 0; // YUYV422: plane-0 sample in slot 0, plane-1/2 sample in slot 1; UYVY422: swapped
+                    T        &dst0  = reinterpret_cast<T *>(dst_tensor(coord))[1 - shift];
+                    T        &dst1  = reinterpret_cast<T *>(dst_tensor(coord))[0 + shift];
+
+                    dst0 = src0;
+                    dst1 = src1;
+
+                    Coordinates coord2 = index2coord(dst_tensor.shape(), ++element_idx); // Second element of the pair; the pre-increment makes the outer loop skip it
+
+                    const T &src2 = reinterpret_cast<const T *>(image_planes[0](coord2))[0];
+                    const T &src3 = reinterpret_cast<const T *>(image_planes[2](coord_hori))[0];
+
+                    T &dst2 = reinterpret_cast<T *>(dst_tensor(coord2))[1 - shift];
+                    T &dst3 = reinterpret_cast<T *>(dst_tensor(coord2))[0 + shift];
+
+                    dst2 = src2;
+                    dst3 = src3;
+
+                    break;
+                }
+                case Format::NV12:
+                case Format::NV21:
+                {
+                    if(0U == plane_idx)
+                    {
+                        // Destination plane 0 (Y) is an element-wise copy of source plane 0
+                        dst_tensor[element_idx] = image_planes[0][element_idx];
+                    }
+                    else
+                    {
+                        const int shift = (Format::NV12 == format) ? 0 : 1; // NV12 writes U then V, NV21 writes V then U
+
+                        // Get U channel from plane1 and V channel from plane2 of the source
+                        const T &src_u0 = reinterpret_cast<const T *>(image_planes[1](coord))[0];
+                        const T &src_v0 = reinterpret_cast<const T *>(image_planes[2](coord))[0];
+
+                        // Get U and V channel from plane1 of destination multi-image
+                        T &dst_u0 = reinterpret_cast<T *>(dst_tensor(coord))[0 + shift];
+                        T &dst_v0 = reinterpret_cast<T *>(dst_tensor(coord))[1 - shift];
+
+                        // Combine channel U and V
+                        dst_u0 = src_u0;
+                        dst_v0 = src_v0;
+                    }
+
+                    break;
+                }
+                case Format::IYUV:
+                case Format::YUV444:
+                {
+                    // Planar formats: destination plane p is a copy of source plane p
+                    const T &src = reinterpret_cast<const T *>(image_planes[plane_idx](coord))[0];
+                    T       &dst = reinterpret_cast<T *>(dst_tensor(coord))[0];
+
+                    // Copy Y/U/V plane
+                    dst = src;
+
+                    break;
+                }
+                default:
+                    ARM_COMPUTE_ERROR("Not supported");
+                    break;
+            }
+        }
+    }
+
+    return dst;
+}
+
+template std::vector<SimpleTensor<uint8_t>> channel_combine(const TensorShape &shape, const std::vector<SimpleTensor<uint8_t>> &image_planes, Format format); // Explicit instantiation for uint8_t
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/reference/ChannelCombine.h b/tests/validation/reference/ChannelCombine.h
new file mode 100644
index 0000000..cc6607d
--- /dev/null
+++ b/tests/validation/reference/ChannelCombine.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_CHANNEL_COMBINE_H__
+#define __ARM_COMPUTE_TEST_CHANNEL_COMBINE_H__
+
+#include "tests/SimpleTensor.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+template <typename T> // Reference channel combine: takes the image shape, the source planes and the target format; returns the combined plane(s)
+std::vector<SimpleTensor<T>> channel_combine(const TensorShape &shape, const std::vector<SimpleTensor<T>> &image_planes, Format format);
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_TEST_CHANNEL_COMBINE_H__ */
diff --git a/tests/validation/reference/ChannelShuffle.cpp b/tests/validation/reference/ChannelShuffle.cpp
new file mode 100644
index 0000000..c4d8d50
--- /dev/null
+++ b/tests/validation/reference/ChannelShuffle.cpp
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "ChannelShuffle.h"
+
+#include "arm_compute/core/Types.h"
+#include "tests/validation/Helpers.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+// Reference implementation for channel shuffle taken from https://github.com/pytorch/pytorch/blob/master/caffe2/operators/channel_shuffle_op.h
+template <typename T> // Returns a tensor shaped like src in which the channels (dimension 2) are interleaved across num_groups groups
+SimpleTensor<T> channel_shuffle(const SimpleTensor<T> &src, int num_groups)
+{
+    // Create reference
+    SimpleTensor<T> dst{ src.shape(), src.data_type(), src.num_channels(), src.fixed_point_position(), src.quantization_info() };
+
+    const int M                 = src.shape()[0]; // NOTE(review): dimensions 0 and 1 are presumably width and height - confirm against callers
+    const int N                 = src.shape()[1];
+    const int num_channels      = src.shape()[2];
+    const int batches           = src.shape()[3];
+    const int MxN               = M * N; // Number of elements in one channel plane
+    const int channels_in_group = num_channels / num_groups; // Integer division: any remainder channels are not copied
+
+    const T *src_ref = src.data();
+    T       *dst_ref = dst.data();
+
+    for(int n = 0; n < batches; ++n)
+    {
+        for(int g = 0; g < num_groups; ++g)
+        {
+            // Group g occupies a contiguous run of channels_in_group planes in the source, starting at channel g * channels_in_group.
+            // Its i-th plane is written to destination channel g + i * num_groups, i.e. the groups end up interleaved.
+            const T *src_ptr = src_ref + g * channels_in_group * MxN + n * num_channels * MxN;
+            T       *dst_ptr = dst_ref + g * MxN + n * num_channels * MxN;
+            for(int i = 0; i < channels_in_group; ++i)
+            {
+                std::copy(src_ptr + i * MxN,
+                          src_ptr + (i + 1) * MxN,
+                          dst_ptr + i * num_groups * MxN);
+            }
+        }
+    }
+
+    return dst;
+}
+
+template SimpleTensor<uint8_t> channel_shuffle(const SimpleTensor<uint8_t> &src, int num_groups); // Explicit instantiations: uint8_t, uint16_t, uint32_t, half and float
+template SimpleTensor<uint16_t> channel_shuffle(const SimpleTensor<uint16_t> &src, int num_groups);
+template SimpleTensor<uint32_t> channel_shuffle(const SimpleTensor<uint32_t> &src, int num_groups);
+template SimpleTensor<half> channel_shuffle(const SimpleTensor<half> &src, int num_groups);
+template SimpleTensor<float> channel_shuffle(const SimpleTensor<float> &src, int num_groups);
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/reference/ChannelShuffle.h b/tests/validation/reference/ChannelShuffle.h
new file mode 100644
index 0000000..52df19e
--- /dev/null
+++ b/tests/validation/reference/ChannelShuffle.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_CHANNEL_SHUFFLE_H__
+#define __ARM_COMPUTE_TEST_CHANNEL_SHUFFLE_H__
+
+#include "tests/SimpleTensor.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+template <typename T> // Reference channel shuffle: returns a tensor shaped like src with its channels interleaved across num_groups groups
+SimpleTensor<T> channel_shuffle(const SimpleTensor<T> &src, int num_groups);
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_TEST_CHANNEL_SHUFFLE_H__ */
diff --git a/tests/validation/reference/ConvertFullyConnectedWeights.cpp b/tests/validation/reference/ConvertFullyConnectedWeights.cpp
new file mode 100644
index 0000000..b0f537f
--- /dev/null
+++ b/tests/validation/reference/ConvertFullyConnectedWeights.cpp
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "ConvertFullyConnectedWeights.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+template <typename T> // Returns a copy of src whose y coordinates are permuted according to original_input_shape and training_data_layout
+SimpleTensor<T> convert_fully_connected_weights(const SimpleTensor<T> &src, const TensorShape &original_input_shape, const DataLayout training_data_layout)
+{
+    SimpleTensor<T> dst(src.shape(), src.data_type());
+
+    const bool         is_nchw_to_nhwc           = training_data_layout == DataLayout::NCHW; // When false, the two factors below are swapped
+    const unsigned int num_elems_per_input_plane = original_input_shape.x() * original_input_shape.y();
+    const unsigned int num_channels              = original_input_shape.z();
+    const unsigned int factor_1                  = is_nchw_to_nhwc ? num_elems_per_input_plane : num_channels;
+    const unsigned int factor_2                  = is_nchw_to_nhwc ? num_channels : num_elems_per_input_plane;
+
+    for(int i = 0; i < src.num_elements(); ++i)
+    {
+        const Coordinates coords_in = index2coords(src.shape(), i);
+        const Coordinates coords_out(coords_in.x(), coords_in.y() % factor_1 * factor_2 + coords_in.y() / factor_1); // x is kept; y_out = (y % factor_1) * factor_2 + y / factor_1
+
+        dst[coords2index(dst.shape(), coords_out)] = src[i];
+    }
+
+    return dst;
+}
+
+template SimpleTensor<uint8_t> convert_fully_connected_weights(const SimpleTensor<uint8_t> &src, const TensorShape &original_input_shape, // Explicit instantiations: uint8_t, half and float
+                                                               const DataLayout training_data_layout);
+template SimpleTensor<half> convert_fully_connected_weights(const SimpleTensor<half> &src, const TensorShape &original_input_shape,
+                                                            const DataLayout training_data_layout);
+template SimpleTensor<float> convert_fully_connected_weights(const SimpleTensor<float> &src, const TensorShape &original_input_shape,
+                                                             const DataLayout training_data_layout);
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/reference/ConvertFullyConnectedWeights.h b/tests/validation/reference/ConvertFullyConnectedWeights.h
new file mode 100644
index 0000000..a9bbf13
--- /dev/null
+++ b/tests/validation/reference/ConvertFullyConnectedWeights.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_CONVERT_FULLY_CONNECTED_WEIGHTS_H__
+#define __ARM_COMPUTE_TEST_CONVERT_FULLY_CONNECTED_WEIGHTS_H__
+
+#include "tests/SimpleTensor.h"
+#include "tests/validation/Helpers.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+template <typename T> // Reference weights conversion: returns a permuted copy of src based on the original input shape and the training data layout
+SimpleTensor<T> convert_fully_connected_weights(const SimpleTensor<T> &src, const TensorShape &original_input_shape, const DataLayout training_data_layout);
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_TEST_CONVERT_FULLY_CONNECTED_WEIGHTS_H__ */
diff --git a/tests/validation/reference/Convolution3d.h b/tests/validation/reference/Convolution3d.h
new file mode 100644
index 0000000..7001758
--- /dev/null
+++ b/tests/validation/reference/Convolution3d.h
@@ -0,0 +1,223 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_VALIDATION_CONVOLUTION_H__
+#define __ARM_COMPUTE_TEST_VALIDATION_CONVOLUTION_H__
+
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "tests/validation/FixedPoint.h"
+#include "tests/validation/Helpers.h"
+#include "tests/validation/reference/UtilsQuantizedAsymm.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace convolution_3d
+{
+namespace detail
+{
+inline bool is_valid_pixel(int i, int min, int max) // True when i lies in the half-open range [min, max)
+{
+    return (i >= min && i < max);
+}
+
+// 3D convolution for floating point type: writes one output element, centred on input position (xi, yi) and summed over depth_in input planes
+template < typename T, typename TB, typename std::enable_if < validation::is_floating_point<T>::value &&validation::is_floating_point<TB>::value, int >::type = 0 >
+inline void convolution3d(const SimpleTensor<T> &in, const SimpleTensor<T> &weights, const SimpleTensor<TB> &bias, SimpleTensor<T> &out,
+                          int i_offset, int w_offset, int b_offset, int o_offset,
+                          int xi, int yi, int width_in, int height_in, int depth_in, int width_weights, int height_weights, int dilation_x = 1, int dilation_y = 1)
+{
+    const T *in_ptr  = in.data() + i_offset; // The *_offset arguments are element offsets into the flat data of each tensor
+    const T *w_ptr   = weights.data() + w_offset;
+    const TB *b_ptr   = bias.data() + b_offset;
+    T        *out_ptr = out.data() + o_offset;
+    // Kernel taps run over [-start, end]; for an even kernel size the range is one tap shorter on the positive side
+    const int half_width_weights_start  = width_weights / 2;
+    const int half_width_weights_end    = ((width_weights % 2) == 0) ? (half_width_weights_start - 1) : half_width_weights_start;
+    const int half_height_weights_start = height_weights / 2;
+    const int half_height_weights_end   = ((height_weights % 2) == 0) ? (half_height_weights_start - 1) : half_height_weights_start;
+
+    // Reset accumulator
+    T acc(0);
+
+    // Compute a 2D convolution for each IFM and accumulate the result
+    for(int ifm = 0; ifm < depth_in; ++ifm)
+    {
+        // Compute the offset for the input slice
+        const int offset_slice_in = xi + yi * width_in + ifm * width_in * height_in;
+
+        // Compute 2D convolution
+        for(int yk = -half_height_weights_start; yk <= half_height_weights_end; ++yk)
+        {
+            for(int xk = -half_width_weights_start; xk <= half_width_weights_end; ++xk)
+            {
+                // Taps that fall outside the input plane are skipped, so they contribute nothing to the sum
+                if(is_valid_pixel(xi + xk * dilation_x, 0, width_in) && is_valid_pixel(yi + yk * dilation_y, 0, height_in))
+                {
+                    const int idx = xk + half_width_weights_start; // Tap position inside the kernel, starting at 0
+                    const int idy = yk + half_height_weights_start;
+
+                    const T i_value = in_ptr[offset_slice_in + xk * dilation_x + yk * dilation_y * width_in];
+                    const T w_value = w_ptr[idx + idy * width_weights + ifm * width_weights * height_weights];
+
+                    acc += i_value * w_value;
+                }
+            }
+        }
+    }
+
+    // Accumulate the bias and store the result
+    *out_ptr = acc + (*b_ptr);
+}
+
+// 3D convolution for fixed point type: writes one output element, accumulating in the promoted fixed point type and converting back to T on store
+template < typename T, typename TB, typename std::enable_if < std::is_integral<T>::value &&std::is_integral<TB>::value, int >::type = 0 >
+inline void convolution3d(const SimpleTensor<T> &in, const SimpleTensor<T> &weights, const SimpleTensor<TB> &bias, SimpleTensor<T> &out,
+                          int i_offset, int w_offset, int b_offset, int o_offset,
+                          int xi, int yi, int width_in, int height_in, int depth_in, int width_weights, int height_weights, int dilation_x = 1, int dilation_y = 1)
+{
+    const T *in_ptr               = in.data() + i_offset; // The *_offset arguments are element offsets into the flat data of each tensor
+    const T *w_ptr                = weights.data() + w_offset;
+    const TB *b_ptr               = bias.data() + b_offset; // bias holds TB elements (as in the floating point overload); const T * would not compile when T != TB
+    T       *out_ptr              = out.data() + o_offset;
+    int      fixed_point_position = in.fixed_point_position(); // The input's position is used for the weights, bias and accumulator as well
+    // Kernel taps run over [-start, end]; for an even kernel size the range is one tap shorter on the positive side
+    const int half_width_weights_start  = width_weights / 2;
+    const int half_width_weights_end    = ((width_weights % 2) == 0) ? (half_width_weights_start - 1) : half_width_weights_start;
+    const int half_height_weights_start = height_weights / 2;
+    const int half_height_weights_end   = ((height_weights % 2) == 0) ? (half_height_weights_start - 1) : half_height_weights_start;
+
+    using namespace fixed_point_arithmetic;
+    using promoted_type = fixed_point_arithmetic::traits::promote_t<T>;
+
+    // Reset accumulator
+    fixed_point<promoted_type> acc(0, fixed_point_position);
+
+    // Compute a 2D convolution for each IFM and accumulate the result
+    for(int ifm = 0; ifm < depth_in; ++ifm)
+    {
+        // Compute the offset for the input slice
+        const int offset_slice_in = xi + yi * width_in + ifm * width_in * height_in;
+
+        // Compute 2D convolution
+        for(int yk = -half_height_weights_start; yk <= half_height_weights_end; ++yk)
+        {
+            for(int xk = -half_width_weights_start; xk <= half_width_weights_end; ++xk)
+            {
+                // Taps that fall outside the input plane are skipped, so they contribute nothing to the sum
+                if(is_valid_pixel(xi + xk * dilation_x, 0, width_in) && is_valid_pixel(yi + yk * dilation_y, 0, height_in))
+                {
+                    const int idx = xk + half_width_weights_start; // Tap position inside the kernel, starting at 0
+                    const int idy = yk + half_height_weights_start;
+
+                    const fixed_point<promoted_type> i_value(in_ptr[offset_slice_in + xk * dilation_x + yk * dilation_y * width_in], fixed_point_position, true);
+                    const fixed_point<promoted_type> w_value(w_ptr[idx + idy * width_weights + ifm * width_weights * height_weights], fixed_point_position, true);
+                    const fixed_point<promoted_type> iw = i_value * w_value;
+                    acc                                 = iw + acc;
+                }
+            }
+        }
+    }
+
+    // Get the bias
+    const fixed_point<promoted_type> b(*b_ptr, fixed_point_position, true);
+
+    // Accumulate the bias and convert back
+    acc = acc + b;
+    fixed_point<T> res(acc);
+    *out_ptr = res.raw();
+}
+
+// 3D convolution for QASYMM8 type: writes one uint8_t output element from uint8_t input/weights and an int32_t bias, using an int32_t accumulator
+template <>
+inline void convolution3d(const SimpleTensor<uint8_t> &in, const SimpleTensor<uint8_t> &weights, const SimpleTensor<int32_t> &bias, SimpleTensor<uint8_t> &out,
+                          int i_offset, int w_offset, int b_offset, int o_offset,
+                          int xi, int yi, int width_in, int height_in, int depth_in, int width_weights, int height_weights, int dilation_x, int dilation_y)
+{
+    const uint8_t *in_ptr  = in.data() + i_offset; // The *_offset arguments are element offsets into the flat data of each tensor
+    const uint8_t *w_ptr   = weights.data() + w_offset;
+    const int32_t *b_ptr   = bias.data() + b_offset;
+    uint8_t       *out_ptr = out.data() + o_offset;
+    // Input and weights offsets are negated here, so adding them below subtracts the quantization offset
+    const int   input_offset   = -in.quantization_info().offset;
+    const float input_scale    = in.quantization_info().scale;
+    const int   weights_offset = -weights.quantization_info().offset;
+    const float weights_scale  = weights.quantization_info().scale;
+    const int   output_offset  = out.quantization_info().offset;
+    const float output_scale   = out.quantization_info().scale;
+
+    int         output_multiplier = 0;
+    int         output_shift      = 0;
+    const float multiplier        = input_scale * weights_scale / output_scale;
+    arm_compute::quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift); // NOTE(review): presumably splits multiplier into an integer multiplier and a right shift - confirm in AsymmHelpers
+    // Kernel taps run over [-start, end]; for an even kernel size the range is one tap shorter on the positive side
+    const int half_width_weights_start  = width_weights / 2;
+    const int half_width_weights_end    = ((width_weights % 2) == 0) ? (half_width_weights_start - 1) : half_width_weights_start;
+    const int half_height_weights_start = height_weights / 2;
+    const int half_height_weights_end   = ((height_weights % 2) == 0) ? (half_height_weights_start - 1) : half_height_weights_start;
+
+    // Reset accumulator
+    int32_t acc(0);
+
+    // Compute a 2D convolution for each IFM and accumulate the result
+    for(int ifm = 0; ifm < depth_in; ++ifm)
+    {
+        // Compute the offset for the input slice
+        const int offset_slice_in = xi + yi * width_in + ifm * width_in * height_in;
+
+        // Compute 2D convolution
+        for(int yk = -half_height_weights_start; yk <= half_height_weights_end; ++yk)
+        {
+            for(int xk = -half_width_weights_start; xk <= half_width_weights_end; ++xk)
+            {
+                // Taps that fall outside the input plane are skipped, so they contribute nothing to the sum
+                if(is_valid_pixel(xi + xk * dilation_x, 0, width_in) && is_valid_pixel(yi + yk * dilation_y, 0, height_in))
+                {
+                    const int idx = xk + half_width_weights_start; // Tap position inside the kernel, starting at 0
+                    const int idy = yk + half_height_weights_start;
+
+                    const uint8_t i_value = in_ptr[offset_slice_in + xk * dilation_x + yk * dilation_y * width_in];
+                    const uint8_t w_value = w_ptr[idx + idy * width_weights + ifm * width_weights * height_weights];
+
+                    acc += (i_value + input_offset) * (w_value + weights_offset);
+                }
+            }
+        }
+    }
+
+    // Accumulate the bias
+    acc += (*b_ptr);
+    // Rescale the accumulator with the multiplier/shift pair, add the output offset and clamp to the uint8_t range
+    acc = validation::asymm_rounding_divide_by_pow2(validation::asymm_int_mult(acc, output_multiplier), output_shift);
+    acc += output_offset;
+    acc = utility::clamp<int32_t>(acc, 0, 255);
+
+    // Store the result
+    *out_ptr = acc;
+}
+} // namespace detail
+} // namespace convolution_3d
+} // namespace test
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_TEST_VALIDATION_CONVOLUTION_H__ */
diff --git a/tests/validation/reference/ConvolutionLayer.cpp b/tests/validation/reference/ConvolutionLayer.cpp
index b7ed2f5..fe558ba 100644
--- a/tests/validation/reference/ConvolutionLayer.cpp
+++ b/tests/validation/reference/ConvolutionLayer.cpp
@@ -25,6 +25,8 @@
 
 #include "tests/validation/FixedPoint.h"
 #include "tests/validation/Helpers.h"
+#include "tests/validation/reference/Convolution3d.h"
+#include "tests/validation/reference/Permute.h"
 #include "tests/validation/reference/Utils.h"
 #include "tests/validation/reference/UtilsQuantizedAsymm.h"
 
@@ -42,193 +44,12 @@
 {
 namespace
 {
-inline bool is_valid_pixel(int i, int min, int max)
-{
-    return (i >= min && i < max);
-}
-
-// 3D convolution for floating point type
-template < typename T, typename TB, typename std::enable_if < is_floating_point<T>::value &&is_floating_point<TB>::value, int >::type = 0 >
-void convolution3d(const SimpleTensor<T> &in, const SimpleTensor<T> &weights, const SimpleTensor<TB> &bias, SimpleTensor<T> &out,
-                   int i_offset, int w_offset, int b_offset, int o_offset,
-                   int xi, int yi, int width_in, int height_in, int depth_in, int width_weights, int height_weights)
-{
-    const T *in_ptr  = in.data() + i_offset;
-    const T *w_ptr   = weights.data() + w_offset;
-    const TB *b_ptr   = bias.data() + b_offset;
-    T        *out_ptr = out.data() + o_offset;
-
-    const int half_width_weights_start  = width_weights / 2;
-    const int half_width_weights_end    = ((width_weights % 2) == 0) ? (half_width_weights_start - 1) : half_width_weights_start;
-    const int half_height_weights_start = height_weights / 2;
-    const int half_height_weights_end   = ((height_weights % 2) == 0) ? (half_height_weights_start - 1) : half_height_weights_start;
-
-    // Reset accumulator
-    T acc(0);
-
-    // Compute a 2D convolution for each IFM and accumulate the result
-    for(int ifm = 0; ifm < depth_in; ++ifm)
-    {
-        // Compute the offset for the input slice
-        const int offset_slice_in = xi + yi * width_in + ifm * width_in * height_in;
-
-        // Compute 2D convolution
-        for(int yk = -half_height_weights_start; yk <= half_height_weights_end; ++yk)
-        {
-            for(int xk = -half_width_weights_start; xk <= half_width_weights_end; ++xk)
-            {
-                // Check if the pixel is out-of-bound
-                if(is_valid_pixel(xi + xk, 0, width_in) && is_valid_pixel(yi + yk, 0, height_in))
-                {
-                    const int idx = xk + half_width_weights_start;
-                    const int idy = yk + half_height_weights_start;
-
-                    const T i_value = in_ptr[offset_slice_in + xk + yk * width_in];
-                    const T w_value = w_ptr[idx + idy * width_weights + ifm * width_weights * height_weights];
-
-                    acc += i_value * w_value;
-                }
-            }
-        }
-    }
-
-    // Accumulate the bias and store the result
-    *out_ptr = acc + (*b_ptr);
-}
-
-// 3D convolution for fixed point type
-template < typename T, typename TB, typename std::enable_if < std::is_integral<T>::value &&std::is_integral<TB>::value, int >::type = 0 >
-void convolution3d(const SimpleTensor<T> &in, const SimpleTensor<T> &weights, const SimpleTensor<TB> &bias, SimpleTensor<T> &out,
-                   int i_offset, int w_offset, int b_offset, int o_offset,
-                   int xi, int yi, int width_in, int height_in, int depth_in, int width_weights, int height_weights)
-{
-    const T *in_ptr               = in.data() + i_offset;
-    const T *w_ptr                = weights.data() + w_offset;
-    const T *b_ptr                = bias.data() + b_offset;
-    T       *out_ptr              = out.data() + o_offset;
-    int      fixed_point_position = in.fixed_point_position();
-
-    const int half_width_weights_start  = width_weights / 2;
-    const int half_width_weights_end    = ((width_weights % 2) == 0) ? (half_width_weights_start - 1) : half_width_weights_start;
-    const int half_height_weights_start = height_weights / 2;
-    const int half_height_weights_end   = ((height_weights % 2) == 0) ? (half_height_weights_start - 1) : half_height_weights_start;
-
-    using namespace fixed_point_arithmetic;
-    using promoted_type = fixed_point_arithmetic::traits::promote_t<T>;
-
-    // Reset accumulator
-    fixed_point<promoted_type> acc(0, fixed_point_position);
-
-    // Compute a 2D convolution for each IFM and accumulate the result
-    for(int ifm = 0; ifm < depth_in; ++ifm)
-    {
-        // Compute the offset for the input slice
-        const int offset_slice_in = xi + yi * width_in + ifm * width_in * height_in;
-
-        // Compute 2D convolution
-        for(int yk = -half_height_weights_start; yk <= half_height_weights_end; ++yk)
-        {
-            for(int xk = -half_width_weights_start; xk <= half_width_weights_end; ++xk)
-            {
-                // Check if the pixel is out-of-bound
-                if(is_valid_pixel(xi + xk, 0, width_in) && is_valid_pixel(yi + yk, 0, height_in))
-                {
-                    const int idx = xk + half_width_weights_start;
-                    const int idy = yk + half_height_weights_start;
-
-                    const fixed_point<promoted_type> i_value(in_ptr[offset_slice_in + xk + yk * width_in], fixed_point_position, true);
-                    const fixed_point<promoted_type> w_value(w_ptr[idx + idy * width_weights + ifm * width_weights * height_weights], fixed_point_position, true);
-                    const fixed_point<promoted_type> iw = i_value * w_value;
-                    acc                                 = iw + acc;
-                }
-            }
-        }
-    }
-
-    // Get the bias
-    const fixed_point<promoted_type> b(*b_ptr, fixed_point_position, true);
-
-    // Accumulate the bias and covert back
-    acc = acc + b;
-    fixed_point<T> res(acc);
-    *out_ptr = res.raw();
-}
-
-// 3D convolution for QASYMM8 type
-template <>
-void convolution3d(const SimpleTensor<uint8_t> &in, const SimpleTensor<uint8_t> &weights, const SimpleTensor<int32_t> &bias, SimpleTensor<uint8_t> &out,
-                   int i_offset, int w_offset, int b_offset, int o_offset,
-                   int xi, int yi, int width_in, int height_in, int depth_in, int width_weights, int height_weights)
-{
-    const uint8_t *in_ptr  = in.data() + i_offset;
-    const uint8_t *w_ptr   = weights.data() + w_offset;
-    const int32_t *b_ptr   = bias.data() + b_offset;
-    uint8_t       *out_ptr = out.data() + o_offset;
-
-    const int   input_offset   = -in.quantization_info().offset;
-    const float input_scale    = in.quantization_info().scale;
-    const int   weights_offset = -weights.quantization_info().offset;
-    const float weights_scale  = weights.quantization_info().scale;
-    const int   output_offset  = out.quantization_info().offset;
-    const float output_scale   = out.quantization_info().scale;
-
-    int         output_multiplier = 0;
-    int         output_shift      = 0;
-    const float multiplier        = input_scale * weights_scale / output_scale;
-    arm_compute::quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
-
-    const int half_width_weights_start  = width_weights / 2;
-    const int half_width_weights_end    = ((width_weights % 2) == 0) ? (half_width_weights_start - 1) : half_width_weights_start;
-    const int half_height_weights_start = height_weights / 2;
-    const int half_height_weights_end   = ((height_weights % 2) == 0) ? (half_height_weights_start - 1) : half_height_weights_start;
-
-    // Reset accumulator
-    int32_t acc(0);
-
-    // Compute a 2D convolution for each IFM and accumulate the result
-    for(int ifm = 0; ifm < depth_in; ++ifm)
-    {
-        // Compute the offset for the input slice
-        const int offset_slice_in = xi + yi * width_in + ifm * width_in * height_in;
-
-        // Compute 2D convolution
-        for(int yk = -half_height_weights_start; yk <= half_height_weights_end; ++yk)
-        {
-            for(int xk = -half_width_weights_start; xk <= half_width_weights_end; ++xk)
-            {
-                // Check if the pixel is out-of-bound
-                if(is_valid_pixel(xi + xk, 0, width_in) && is_valid_pixel(yi + yk, 0, height_in))
-                {
-                    const int idx = xk + half_width_weights_start;
-                    const int idy = yk + half_height_weights_start;
-
-                    const uint8_t i_value = in_ptr[offset_slice_in + xk + yk * width_in];
-                    const uint8_t w_value = w_ptr[idx + idy * width_weights + ifm * width_weights * height_weights];
-
-                    acc += (i_value + input_offset) * (w_value + weights_offset);
-                }
-            }
-        }
-    }
-
-    // Accumulate the bias
-    acc += (*b_ptr);
-
-    acc = asymm_rounding_divide_by_pow2(asymm_int_mult(acc, output_multiplier), output_shift);
-    acc += output_offset;
-    acc = utility::clamp<int32_t>(acc, 0, 255);
-
-    // Store the result
-    *out_ptr = acc;
-}
 } // namespace
 
 template <typename T, typename TB>
-SimpleTensor<T> convolution_layer(const SimpleTensor<T> &src, const SimpleTensor<T> &weights, const SimpleTensor<TB> &bias, const TensorShape &output_shape, const PadStrideInfo &info)
+SimpleTensor<T> convolution_layer_nchw(const SimpleTensor<T> &src, const SimpleTensor<T> &weights, const SimpleTensor<TB> &bias, SimpleTensor<T> &dst, const PadStrideInfo &info,
+                                       const Size2D &dilation)
 {
-    // Create reference
-    SimpleTensor<T> dst{ output_shape, src.data_type(), 1, src.fixed_point_position(), src.quantization_info() };
-
     // Compute reference
     const int width_in       = src.shape().x();
     const int height_in      = src.shape().y();
@@ -244,10 +65,10 @@
     const int stride_xi      = info.stride().first;
     const int stride_yi      = info.stride().second;
 
-    auto output_wh = scaled_dimensions(width_in, height_in, width_weights, height_weights, info);
+    auto output_wh = scaled_dimensions(width_in, height_in, width_weights, height_weights, info, dilation);
 
-    const int start_xi    = width_weights / 2 - pad_left;
-    const int start_yi    = height_weights / 2 - pad_top;
+    const int start_xi    = (dilation.x() * (width_weights - 1) + 1) / 2 - pad_left;
+    const int start_yi    = (dilation.y() * (height_weights - 1) + 1) / 2 - pad_top;
     const int end_xi      = output_wh.first * stride_xi;
     const int end_yi      = output_wh.second * stride_yi;
     const int num_batches = src.shape().total_size() / (width_in * height_in * depth_in);
@@ -270,11 +91,11 @@
                     ARM_COMPUTE_ASSERT(yo < height_out);
 
                     // Compute 3D convolution
-                    convolution3d(src, weights, bias, dst,
-                                  offset_in, ofm * width_weights * height_weights * depth_weights, ofm, offset_out,
-                                  xi, yi,
-                                  width_in, height_in, depth_in,
-                                  width_weights, height_weights);
+                    convolution_3d::detail::convolution3d(src, weights, bias, dst,
+                                                          offset_in, ofm * width_weights * height_weights * depth_weights, ofm, offset_out,
+                                                          xi, yi,
+                                                          width_in, height_in, depth_in,
+                                                          width_weights, height_weights, dilation.x(), dilation.y());
                 }
             }
         }
@@ -282,18 +103,38 @@
 
     return dst;
 }
+template <typename T, typename TB>
+SimpleTensor<T> convolution_layer(const SimpleTensor<T> &src, const SimpleTensor<T> &weights, const SimpleTensor<TB> &bias, const TensorShape &output_shape, const PadStrideInfo &info,
+                                  const Size2D &dilation)
+{
+    // Layout-aware entry point: allocate the output (inheriting fixed point position and quantization info from src), then dispatch on the data layout of src
+    SimpleTensor<T> dst{ output_shape, src.data_type(), 1, src.fixed_point_position(), src.quantization_info() };
+
+    if(src.data_layout() == DataLayout::NHWC) // NHWC: operands are permuted into the *_nchw temporaries before calling the NCHW reference
+    {
+        SimpleTensor<T> src_nchw     = reference::permute<T>(src, PermutationVector(1U, 2U, 0U));
+        SimpleTensor<T> weights_nchw = reference::permute<T>(weights, PermutationVector(1U, 2U, 0U));
+        SimpleTensor<T> dst_nchw     = reference::permute<T>(dst, PermutationVector(1U, 2U, 0U)); // dst has not been written yet at this point
+
+        // (2, 0, 1) is the inverse of (1, 2, 0), so the result is permuted back (presumably to NHWC - confirm against reference::permute)
+        return reference::permute<T>(convolution_layer_nchw(src_nchw, weights_nchw, bias, dst_nchw, info, dilation), PermutationVector(2U, 0U, 1U));
+    }
+    else
+    {
+        return convolution_layer_nchw(src, weights, bias, dst, info, dilation); // any other layout is forwarded unchanged
+    }
+}
 
 template SimpleTensor<float> convolution_layer(const SimpleTensor<float> &src, const SimpleTensor<float> &weights, const SimpleTensor<float> &bias, const TensorShape &output_shape,
-                                               const PadStrideInfo &info);
+                                               const PadStrideInfo &info, const Size2D &dilation);
 template SimpleTensor<half> convolution_layer(const SimpleTensor<half> &src, const SimpleTensor<half> &weights, const SimpleTensor<half> &bias, const TensorShape &output_shape,
-                                              const PadStrideInfo &info);
+                                              const PadStrideInfo &info, const Size2D &dilation);
 template SimpleTensor<qint8_t> convolution_layer(const SimpleTensor<qint8_t> &src, const SimpleTensor<qint8_t> &weights, const SimpleTensor<qint8_t> &bias, const TensorShape &output_shape,
-                                                 const PadStrideInfo &info);
+                                                 const PadStrideInfo &info, const Size2D &dilation);
 template SimpleTensor<qint16_t> convolution_layer(const SimpleTensor<qint16_t> &src, const SimpleTensor<qint16_t> &weights, const SimpleTensor<qint16_t> &bias, const TensorShape &output_shape,
-                                                  const PadStrideInfo &info);
+                                                  const PadStrideInfo &info, const Size2D &dilation);
 template SimpleTensor<uint8_t> convolution_layer(const SimpleTensor<uint8_t> &src, const SimpleTensor<uint8_t> &weights, const SimpleTensor<int32_t> &bias, const TensorShape &output_shape,
-                                                 const PadStrideInfo &info);
+                                                 const PadStrideInfo &info, const Size2D &dilation);
 } // namespace reference
 } // namespace validation
 } // namespace test
-} // namespace arm_compute
+} // namespace arm_compute
\ No newline at end of file
diff --git a/tests/validation/reference/ConvolutionLayer.h b/tests/validation/reference/ConvolutionLayer.h
index 57455ba..ff3b153 100644
--- a/tests/validation/reference/ConvolutionLayer.h
+++ b/tests/validation/reference/ConvolutionLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -36,7 +36,8 @@
 namespace reference
 {
 template <typename T, typename TB>
-SimpleTensor<T> convolution_layer(const SimpleTensor<T> &src, const SimpleTensor<T> &weights, const SimpleTensor<TB> &bias, const TensorShape &output_shape, const PadStrideInfo &info);
+SimpleTensor<T> convolution_layer(const SimpleTensor<T> &src, const SimpleTensor<T> &weights, const SimpleTensor<TB> &bias, const TensorShape &output_shape, const PadStrideInfo &info,
+                                  const Size2D &dilation = Size2D(1U, 1U));
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/Copy.cpp b/tests/validation/reference/Copy.cpp
new file mode 100644
index 0000000..dc519a4
--- /dev/null
+++ b/tests/validation/reference/Copy.cpp
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "Copy.h"
+
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+template <typename T>
+SimpleTensor<T> copy(const SimpleTensor<T> &src, const TensorShape &output_shape)
+{
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(src.shape(), output_shape); // mismatch check between the source shape and output_shape (see the macro for exact semantics)
+
+    SimpleTensor<T> dst(output_shape, src.data_type()); // only the shape and the data type are passed to the new tensor
+    std::copy_n(src.data(), src.num_elements(), dst.data()); // flat copy of src.num_elements() elements from src to dst
+    return dst;
+}
+
+template SimpleTensor<uint8_t> copy(const SimpleTensor<uint8_t> &src, const TensorShape &output_shape);
+template SimpleTensor<int8_t> copy(const SimpleTensor<int8_t> &src, const TensorShape &output_shape);
+template SimpleTensor<uint16_t> copy(const SimpleTensor<uint16_t> &src, const TensorShape &output_shape);
+template SimpleTensor<int16_t> copy(const SimpleTensor<int16_t> &src, const TensorShape &output_shape);
+template SimpleTensor<uint32_t> copy(const SimpleTensor<uint32_t> &src, const TensorShape &output_shape);
+template SimpleTensor<int32_t> copy(const SimpleTensor<int32_t> &src, const TensorShape &output_shape);
+template SimpleTensor<half> copy(const SimpleTensor<half> &src, const TensorShape &output_shape);
+template SimpleTensor<float> copy(const SimpleTensor<float> &src, const TensorShape &output_shape);
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/reference/Copy.h b/tests/validation/reference/Copy.h
new file mode 100644
index 0000000..362af03
--- /dev/null
+++ b/tests/validation/reference/Copy.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_COPY_H__
+#define __ARM_COMPUTE_TEST_COPY_H__
+
+#include "tests/SimpleTensor.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+template <typename T>
+SimpleTensor<T> copy(const SimpleTensor<T> &src, const TensorShape &output_shape);
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_TEST_COPY_H__ */
diff --git a/tests/validation/reference/DepthwiseConvolutionLayer.cpp b/tests/validation/reference/DepthwiseConvolutionLayer.cpp
index b2a7067..10c617e 100644
--- a/tests/validation/reference/DepthwiseConvolutionLayer.cpp
+++ b/tests/validation/reference/DepthwiseConvolutionLayer.cpp
@@ -50,9 +50,9 @@
  *
  */
 template <typename T, typename TB>
-SimpleTensor<T> depthwise_convolution(const SimpleTensor<T> &src, const SimpleTensor<T> &weights, const SimpleTensor<TB> &biases, const TensorShape &dst_shape, const PadStrideInfo &conv_info)
+SimpleTensor<T> depthwise_convolution(const SimpleTensor<T> &src, const SimpleTensor<T> &weights, const SimpleTensor<TB> &biases, const TensorShape &dst_shape, const PadStrideInfo &conv_info,
+                                      unsigned int depth_multiplier)
 {
-    // Create reference
     SimpleTensor<T> dst{ dst_shape, src.data_type(), 1, src.fixed_point_position() };
 
     // Compute reference
@@ -77,33 +77,39 @@
     const int maximum_x = input_width + pad_left - filter_half_width + pad_right - filter_half_width;
     const int maximum_y = input_height + pad_top - filter_half_height + pad_bottom - filter_half_height;
 
+    const T border_value(0);
+
     int out_pos = 0;
     for(int r = 0; r < num_batches; ++r)
     {
         for(int z = 0; z < input_depth; ++z)
         {
-            for(int y = minimum_y; y < minimum_y + maximum_y; y += conv_info.stride().second)
+            for(unsigned int m = 0; m < depth_multiplier; ++m)
             {
-                for(int x = minimum_x; x < minimum_x + maximum_x; x += conv_info.stride().first)
-                {
-                    Coordinates coords(static_cast<int>(x), static_cast<int>(y), static_cast<int>(z), static_cast<int>(r));
-                    size_t      filter_offset = filter_plane * z;
+                const int out_z = z * depth_multiplier + m;
 
-                    T val(0);
-                    for(int j = y - filter_half_height; j <= static_cast<int>(y + filter_half_height); ++j)
+                for(int y = minimum_y; y < minimum_y + maximum_y; y += conv_info.stride().second)
+                {
+                    for(int x = minimum_x; x < minimum_x + maximum_x; x += conv_info.stride().first)
                     {
-                        for(int i = x - filter_half_width; i <= static_cast<int>(x + filter_half_width); ++i)
+                        Coordinates coords(static_cast<int>(x), static_cast<int>(y), static_cast<int>(z), static_cast<int>(r));
+                        size_t      filter_offset = filter_plane * out_z;
+
+                        T val(0);
+                        for(int j = y - filter_half_height; j <= static_cast<int>(y + filter_half_height); ++j)
                         {
-                            coords.set(0, i);
-                            coords.set(1, j);
-                            T border_value(0);
-                            val += *(weights.data() + filter_offset) * tensor_elem_at(src, coords, BorderMode::CONSTANT, border_value);
-                            ++filter_offset;
+                            for(int i = x - filter_half_width; i <= static_cast<int>(x + filter_half_width); ++i)
+                            {
+                                coords.set(0, i);
+                                coords.set(1, j);
+
+                                val += *(weights.data() + filter_offset) * tensor_elem_at(src, coords, BorderMode::CONSTANT, border_value);
+                                ++filter_offset;
+                            }
                         }
+
+                        dst[out_pos++] = saturate_cast<T>(val + *static_cast<const TB *>(biases(Coordinates(out_z))));
                     }
-                    coords.set(0, x);
-                    coords.set(1, y);
-                    dst[out_pos++] = saturate_cast<T>(val + *static_cast<const TB *>(biases(Coordinates(z))));
                 }
             }
         }
@@ -114,11 +120,11 @@
 
 template <>
 SimpleTensor<uint8_t> depthwise_convolution(const SimpleTensor<uint8_t> &src, const SimpleTensor<uint8_t> &weights, const SimpleTensor<int32_t> &biases, const TensorShape &dst_shape,
-                                            const PadStrideInfo &conv_info)
+                                            const PadStrideInfo &conv_info, unsigned int depth_multiplier)
 {
-    // Create reference
     SimpleTensor<uint8_t> dst{ dst_shape, src.data_type(), 1, src.fixed_point_position(), src.quantization_info() };
 
+    // Quantization parameters (offsets are negated so they can be added to the raw values)
     const int   input_offset   = -src.quantization_info().offset;
     const float input_scale    = src.quantization_info().scale;
     const int   weights_offset = -weights.quantization_info().offset;
@@ -158,35 +164,40 @@
     {
         for(int z = 0; z < input_depth; ++z)
         {
-            int32_t bias_val = *static_cast<const int32_t *>(biases(Coordinates(z)));
-            for(int y = minimum_y; y < minimum_y + maximum_y; y += conv_info.stride().second)
+            for(unsigned int m = 0; m < depth_multiplier; ++m)
             {
-                for(int x = minimum_x; x < minimum_x + maximum_x; x += conv_info.stride().first)
+                const int     out_z    = z * depth_multiplier + m;
+                const int32_t bias_val = *static_cast<const int32_t *>(biases(Coordinates(out_z)));
+
+                for(int y = minimum_y; y < minimum_y + maximum_y; y += conv_info.stride().second)
                 {
-                    Coordinates coords(x, y, z, r);
-                    int         filter_offset = filter_plane * z;
-
-                    int32_t val = 0;
-                    for(int j = y - filter_half_height; j <= (y + filter_half_height); ++j)
+                    for(int x = minimum_x; x < minimum_x + maximum_x; x += conv_info.stride().first)
                     {
-                        for(int i = x - filter_half_width; i <= (x + filter_half_width); ++i)
-                        {
-                            coords.set(0, i);
-                            coords.set(1, j);
-                            auto    in_val = tensor_elem_at<uint8_t>(src, coords, BorderMode::CONSTANT, -input_offset);
-                            uint8_t w_val  = *(weights.data() + filter_offset);
-                            val += (in_val + input_offset) * (w_val + weights_offset);
-                            ++filter_offset;
-                        }
-                    }
-                    val += bias_val;
-                    val = asymm_rounding_divide_by_pow2(asymm_int_mult(val, output_multiplier), output_shift);
-                    val += output_offset;
-                    val = std::max<int32_t>(val, 0);
-                    val = std::min<int32_t>(val, 255);
+                        Coordinates coords(x, y, z, r);
+                        int         filter_offset = filter_plane * out_z;
 
-                    // Store the result
-                    dst[out_pos++] = val;
+                        int32_t val = 0;
+                        for(int j = y - filter_half_height; j <= (y + filter_half_height); ++j)
+                        {
+                            for(int i = x - filter_half_width; i <= (x + filter_half_width); ++i)
+                            {
+                                coords.set(0, i);
+                                coords.set(1, j);
+                                const auto    in_val = tensor_elem_at<uint8_t>(src, coords, BorderMode::CONSTANT, -input_offset);
+                                const uint8_t w_val  = *(weights.data() + filter_offset);
+                                val += (in_val + input_offset) * (w_val + weights_offset);
+                                ++filter_offset;
+                            }
+                        }
+                        val += bias_val;
+                        val = asymm_rounding_divide_by_pow2(asymm_int_mult(val, output_multiplier), output_shift);
+                        val += output_offset;
+                        val = std::max<int32_t>(val, 0);
+                        val = std::min<int32_t>(val, 255);
+
+                        // Store the result
+                        dst[out_pos++] = val;
+                    }
                 }
             }
         }
@@ -196,10 +207,10 @@
 }
 
 template SimpleTensor<float> depthwise_convolution(const SimpleTensor<float> &src, const SimpleTensor<float> &weights, const SimpleTensor<float> &biases, const TensorShape &dst_shape,
-                                                   const PadStrideInfo &conv_info);
+                                                   const PadStrideInfo &conv_info, unsigned int depth_multiplier);
 
 template SimpleTensor<half> depthwise_convolution(const SimpleTensor<half> &src, const SimpleTensor<half> &weights, const SimpleTensor<half> &biases, const TensorShape &dst_shape,
-                                                  const PadStrideInfo &conv_info);
+                                                  const PadStrideInfo &conv_info, unsigned int depth_multiplier);
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/DepthwiseConvolutionLayer.h b/tests/validation/reference/DepthwiseConvolutionLayer.h
index df743a5..bab3387 100644
--- a/tests/validation/reference/DepthwiseConvolutionLayer.h
+++ b/tests/validation/reference/DepthwiseConvolutionLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -36,7 +36,8 @@
 namespace reference
 {
 template <typename T, typename TB>
-SimpleTensor<T> depthwise_convolution(const SimpleTensor<T> &src, const SimpleTensor<T> &weights, const SimpleTensor<TB> &biases, const TensorShape &dst_shape, const PadStrideInfo &conv_info);
+SimpleTensor<T> depthwise_convolution(const SimpleTensor<T> &src, const SimpleTensor<T> &weights, const SimpleTensor<TB> &biases, const TensorShape &dst_shape, const PadStrideInfo &conv_info,
+                                      unsigned int depth_multiplier);
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/DepthwiseSeparableConvolutionLayer.cpp b/tests/validation/reference/DepthwiseSeparableConvolutionLayer.cpp
index ca6c168..8bc6ddb 100644
--- a/tests/validation/reference/DepthwiseSeparableConvolutionLayer.cpp
+++ b/tests/validation/reference/DepthwiseSeparableConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -46,7 +46,7 @@
                                                       const SimpleTensor<T> &pointwise_biases, const TensorShape &dst_shape, const PadStrideInfo &depthwise_conv_info, const PadStrideInfo &pointwise_conv_info)
 {
     // Compute reference
-    SimpleTensor<T> depthwise_out = depthwise_convolution(src, depthwise_weights, depthwise_biases, depthwise_out_shape, depthwise_conv_info);
+    SimpleTensor<T> depthwise_out = depthwise_convolution(src, depthwise_weights, depthwise_biases, depthwise_out_shape, depthwise_conv_info, 1);
     SimpleTensor<T> dst           = convolution_layer(depthwise_out, pointwise_weights, pointwise_biases, dst_shape, pointwise_conv_info);
 
     return dst;
diff --git a/tests/validation/reference/FlattenLayer.cpp b/tests/validation/reference/FlattenLayer.cpp
index 611701d..44f4d93 100644
--- a/tests/validation/reference/FlattenLayer.cpp
+++ b/tests/validation/reference/FlattenLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -34,12 +34,8 @@
 namespace reference
 {
 template <typename T>
-SimpleTensor<T> flatten_layer(const SimpleTensor<T> &src)
+SimpleTensor<T> flatten_layer(const SimpleTensor<T> &src, const TensorShape &shape_flatten)
 {
-    TensorShape shape_flatten(src.shape());
-    shape_flatten.set(0, src.shape()[0] * src.shape()[1] * src.shape()[2]);
-    shape_flatten.remove_dimension(1);
-    shape_flatten.remove_dimension(1);
     SimpleTensor<T> dst(shape_flatten, src.data_type(), 1, src.fixed_point_position());
 
     // Note: Since the reference implementation does not use padding bytes, we can copy directly the content of the source tensor
@@ -48,10 +44,10 @@
     return dst;
 }
 
-template SimpleTensor<float> flatten_layer(const SimpleTensor<float> &src);
-template SimpleTensor<half> flatten_layer(const SimpleTensor<half> &src);
-template SimpleTensor<qint8_t> flatten_layer(const SimpleTensor<qint8_t> &src);
-template SimpleTensor<qint16_t> flatten_layer(const SimpleTensor<qint16_t> &src);
+template SimpleTensor<float> flatten_layer(const SimpleTensor<float> &src, const TensorShape &shape_flatten);
+template SimpleTensor<half> flatten_layer(const SimpleTensor<half> &src, const TensorShape &shape_flatten);
+template SimpleTensor<qint8_t> flatten_layer(const SimpleTensor<qint8_t> &src, const TensorShape &shape_flatten);
+template SimpleTensor<qint16_t> flatten_layer(const SimpleTensor<qint16_t> &src, const TensorShape &shape_flatten);
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/FlattenLayer.h b/tests/validation/reference/FlattenLayer.h
index b1286fe..5ccd429 100644
--- a/tests/validation/reference/FlattenLayer.h
+++ b/tests/validation/reference/FlattenLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -36,7 +36,7 @@
 namespace reference
 {
 template <typename T>
-SimpleTensor<T> flatten_layer(const SimpleTensor<T> &src);
+SimpleTensor<T> flatten_layer(const SimpleTensor<T> &src, const TensorShape &shape_flatten);
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/GEMM.cpp b/tests/validation/reference/GEMM.cpp
index 77d025e..f9dcfcb 100644
--- a/tests/validation/reference/GEMM.cpp
+++ b/tests/validation/reference/GEMM.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -41,23 +41,44 @@
     SimpleTensor<T> dst{ c.shape(), c.data_type(), 1, c.fixed_point_position() };
 
     // Compute reference
-    const int M = dst.shape().y();
-    const int N = dst.shape().x();
+    const int M = a.shape().y();
+    const int N = b.shape().x();
     const int K = a.shape().x();
+    const int D = a.shape().z(); // Number of matrices in a batch
+    const int W = a.shape()[3];  // Number of batched-gemm (Winograd case)
 
-    for(int row = 0; row < M; ++row)
+    const int a_stride_z = K * M;
+    const int a_stride_w = K * M * D;
+
+    const int b_stride_z = b.shape().num_dimensions() > 2 ? N * K : 0;     // Do not slide the matrix B along the 3rd dimension in case matrix B has fewer than 3 dimensions
+    const int b_stride_w = b.shape().num_dimensions() > 3 ? K * N * D : 0; // Do not slide the matrix B along the 4th dimension in case matrix B has fewer than 4 dimensions
+
+    const int c_stride_z = N * M;
+    const int c_stride_w = N * M * D;
+
+    for(int w = 0; w < W; ++w)
     {
-        for(int col = 0; col < N; ++col)
+        for(int depth = 0; depth < D; ++depth)
         {
-            T acc(0);
+            const int base_addr_a = depth * a_stride_z + w * a_stride_w;
+            const int base_addr_b = depth * b_stride_z + w * b_stride_w;
+            const int base_addr_c = depth * c_stride_z + w * c_stride_w;
 
-            for(int k = 0; k < K; ++k)
+            for(int row = 0; row < M; ++row)
             {
-                acc += a[row * K + k] * b[k * N + col];
-            }
+                for(int col = 0; col < N; ++col)
+                {
+                    T acc(0);
 
-            // Finalize the result: alpha * A * B + beta * C
-            dst[col + row * N] = alpha * acc + beta * c[col + row * N];
+                    for(int k = 0; k < K; ++k)
+                    {
+                        acc += a[base_addr_a + k + row * K] * b[base_addr_b + col + k * N];
+                    }
+
+                    // Finalize the result: alpha * A * B + beta * C
+                    dst[base_addr_c + col + row * N] = alpha * acc + beta * c[base_addr_c + col + row * N];
+                }
+            }
         }
     }
 
@@ -75,37 +96,58 @@
     // Compute reference
     using promoted_type = fixed_point_arithmetic::traits::promote_t<T>;
 
-    const int M                    = dst.shape().y();
-    const int N                    = dst.shape().x();
-    const int K                    = a.shape().x();
-    const int fixed_point_position = a.fixed_point_position();
+    const int M = dst.shape().y();
+    const int N = dst.shape().x();
+    const int K = a.shape().x();
+    const int D = a.shape().z(); // Number of matrices in a batch
+    const int W = a.shape()[3];  // Number of batched-gemm (Winograd case)
 
+    const int a_stride_z = K * M;
+    const int a_stride_w = K * M * D;
+
+    const int b_stride_z = b.shape().num_dimensions() > 2 ? N * K : 0;     // Do not slide the matrix B along the 3rd dimension in case matrix B has fewer than 3 dimensions
+    const int b_stride_w = b.shape().num_dimensions() > 3 ? K * N * D : 0; // Do not slide the matrix B along the 4th dimension in case matrix B has fewer than 4 dimensions
+
+    const int c_stride_z = N * M;
+    const int c_stride_w = N * M * D;
+
+    const int            fixed_point_position = a.fixed_point_position();
     const fixed_point<T> alpha_q(alpha, fixed_point_position);
     const fixed_point<T> beta_q(beta, fixed_point_position);
 
-    for(int row = 0; row < M; ++row)
+    for(int w = 0; w < W; ++w)
     {
-        for(int col = 0; col < N; ++col)
+        for(int depth = 0; depth < D; ++depth)
         {
-            fixed_point<promoted_type> acc_q(0, fixed_point_position);
+            const int base_addr_a = depth * a_stride_z + w * a_stride_w;
+            const int base_addr_b = depth * b_stride_z + w * b_stride_w;
+            const int base_addr_c = depth * c_stride_z + w * c_stride_w;
 
-            for(int k = 0; k < K; ++k)
+            for(int row = 0; row < M; ++row)
             {
-                const fixed_point<promoted_type> a0_q(a[row * K + k], fixed_point_position, true);
-                const fixed_point<promoted_type> b0_q(b[k * N + col], fixed_point_position, true);
+                for(int col = 0; col < N; ++col)
+                {
+                    fixed_point<promoted_type> acc_q(0, fixed_point_position);
 
-                acc_q = acc_q + (a0_q * b0_q);
+                    for(int k = 0; k < K; ++k)
+                    {
+                        const fixed_point<promoted_type> a0_q(a[base_addr_a + row * K + k], fixed_point_position, true);
+                        const fixed_point<promoted_type> b0_q(b[base_addr_b + k * N + col], fixed_point_position, true);
+
+                        acc_q = acc_q + (a0_q * b0_q);
+                    }
+
+                    // Finalize the result: alpha * A * B + beta * C
+                    const fixed_point<T> c0_q(c[base_addr_c + col + row * N], fixed_point_position, true);
+
+                    fixed_point<T> res_q(acc_q);
+                    res_q = alpha_q * res_q;
+                    res_q = res_q + (beta_q * c0_q);
+
+                    // Store the result
+                    dst[base_addr_c + col + row * N] = res_q.raw();
+                }
             }
-
-            // Finalize the result: alpha * A * B + beta * C
-            const fixed_point<T> c0_q(c[col + row * N], fixed_point_position, true);
-
-            fixed_point<T> res_q(acc_q);
-            res_q = alpha_q * res_q;
-            res_q = res_q + (beta_q * c0_q);
-
-            // Store the result
-            dst[col + row * N] = res_q.raw();
         }
     }
 
diff --git a/tests/validation/reference/GaussianPyramidHalf.cpp b/tests/validation/reference/GaussianPyramidHalf.cpp
index 0a68ded..7d5eb07 100644
--- a/tests/validation/reference/GaussianPyramidHalf.cpp
+++ b/tests/validation/reference/GaussianPyramidHalf.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/HOGDescriptor.cpp b/tests/validation/reference/HOGDescriptor.cpp
index 105eb83..ed22695 100644
--- a/tests/validation/reference/HOGDescriptor.cpp
+++ b/tests/validation/reference/HOGDescriptor.cpp
@@ -255,6 +255,8 @@
     return desc;
 }
 
+template void hog_orientation_binning(const SimpleTensor<int16_t> &mag, const SimpleTensor<uint8_t> &phase, SimpleTensor<float> &hog_space, const HOGInfo &hog_info);
+template void hog_block_normalization(SimpleTensor<float> &desc, const SimpleTensor<float> &hog_space, const HOGInfo &hog_info);
 template SimpleTensor<float> hog_descriptor(const SimpleTensor<uint8_t> &src, BorderMode border_mode, uint8_t constant_border_value, const HOGInfo &hog_info);
 } // namespace reference
 } // namespace validation
diff --git a/tests/validation/reference/HOGDescriptor.h b/tests/validation/reference/HOGDescriptor.h
index e886445..6ea83fe 100644
--- a/tests/validation/reference/HOGDescriptor.h
+++ b/tests/validation/reference/HOGDescriptor.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017, 2018 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -34,6 +34,12 @@
 {
 namespace reference
 {
+template <typename T, typename U, typename V>
+void hog_orientation_binning(const SimpleTensor<T> &mag, const SimpleTensor<U> &phase, SimpleTensor<V> &hog_space, const HOGInfo &hog_info);
+
+template <typename T>
+void hog_block_normalization(SimpleTensor<T> &desc, const SimpleTensor<T> &hog_space, const HOGInfo &hog_info);
+
 template <typename T, typename U>
 SimpleTensor<T> hog_descriptor(const SimpleTensor<U> &src, BorderMode border_mode, U constant_border_value, const HOGInfo &hog_info);
 } // namespace reference
diff --git a/tests/validation/reference/HOGDetector.cpp b/tests/validation/reference/HOGDetector.cpp
new file mode 100644
index 0000000..5a5ae37
--- /dev/null
+++ b/tests/validation/reference/HOGDetector.cpp
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "HOGDetector.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+namespace
+{
+/** Computes the (width, height) bounds, in window-step units, that hog_detector() uses as exclusive loop limits when sliding the detection window over the feature vector. */
+Size2D num_detection_windows(const TensorShape &shape, const Size2D &window_step, const HOGInfo &hog_info)
+{
+    const size_t num_block_strides_width  = hog_info.detection_window_size().width / hog_info.block_stride().width;   // Detection window width expressed in block strides
+    const size_t num_block_strides_height = hog_info.detection_window_size().height / hog_info.block_stride().height; // Detection window height expressed in block strides
+
+    return Size2D(floor_to_multiple(shape.x() - num_block_strides_width, window_step.width) + window_step.width, // NOTE(review): assumes shape.x()/y() are not smaller than the window size in block strides - confirm with callers
+                  floor_to_multiple(shape.y() - num_block_strides_height, window_step.height) + window_step.height);
+}
+} // namespace
+
+template <typename T> // Reference HOG detector: scores every detection window position in src against a linear SVM descriptor and returns the windows scoring above threshold
+std::vector<DetectionWindow> hog_detector(const SimpleTensor<T> &src, const std::vector<T> &descriptor, unsigned int max_num_detection_windows,
+                                          const HOGInfo &hog_info, const Size2D &detection_window_stride, float threshold, uint16_t idx_class)
+{
+    ARM_COMPUTE_ERROR_ON_MSG((detection_window_stride.width % hog_info.block_stride().width != 0),
+                             "Detection window stride width must be multiple of block stride width");
+    ARM_COMPUTE_ERROR_ON_MSG((detection_window_stride.height % hog_info.block_stride().height != 0),
+                             "Detection window stride height must be multiple of block stride height");
+
+    // Result vector; receives one entry per accepted detection window
+    std::vector<DetectionWindow> windows;
+
+    // Calculate detection window step, i.e. the detection window stride expressed in block strides
+    const Size2D window_step(detection_window_stride.width / hog_info.block_stride().width,
+                             detection_window_stride.height / hog_info.block_stride().height);
+
+    // Calculate the exclusive loop bounds for the window origin (see num_detection_windows)
+    const Size2D num_windows = num_detection_windows(src.shape(), window_step, hog_info);
+
+    // Calculate detection window and row offsets in feature vector (all offsets are element indices into src)
+    const size_t src_offset_x   = window_step.width * hog_info.num_bins() * hog_info.num_cells_per_block().area();                    // Advance applied per window step along x
+    const size_t src_offset_y   = window_step.height * hog_info.num_bins() * hog_info.num_cells_per_block().area() * src.shape().x(); // Advance applied per window step along y
+    const size_t src_offset_row = src.num_channels() * src.shape().x();                                                               // Advance applied per block row inside a window
+
+    // Calculate detection window attributes
+    const Size2D       num_block_positions_per_detection_window = hog_info.num_block_positions_per_image(hog_info.detection_window_size());
+    const unsigned int num_bins_per_descriptor_x                = num_block_positions_per_detection_window.width * src.num_channels(); // Values read per block row of a window
+    const unsigned int num_blocks_per_descriptor_y              = num_block_positions_per_detection_window.height;                     // Block rows per window
+
+    ARM_COMPUTE_ERROR_ON((num_bins_per_descriptor_x * num_blocks_per_descriptor_y + 1) != hog_info.descriptor_size()); // The +1 is the bias stored as the last descriptor element
+
+    size_t win_id = 0; // Counts every window above threshold, including those dropped once the cap is reached
+
+    // Traverse feature vector in detection window steps
+    for(auto win_y = 0u, offset_y = 0u; win_y < num_windows.height; win_y += window_step.height, offset_y += src_offset_y)
+    {
+        for(auto win_x = 0u, offset_x = 0u; win_x < num_windows.width; win_x += window_step.width, offset_x += src_offset_x)
+        {
+            // Reset the score
+            float score = 0.0f;
+
+            // Traverse detection window one block row at a time
+            for(auto y = 0u, offset_row = 0u; y < num_blocks_per_descriptor_y; ++y, offset_row += src_offset_row)
+            {
+                const int bin_offset = y * num_bins_per_descriptor_x; // Start of this block row inside the descriptor
+
+                for(auto x = 0u; x < num_bins_per_descriptor_x; ++x)
+                {
+                    // Compute Linear SVM: accumulate the dot product between window features and descriptor weights
+                    const float a = src[x + offset_x + offset_y + offset_row];
+                    const float b = descriptor[x + bin_offset];
+                    score += a * b;
+                }
+            }
+
+            // Add the bias. The bias is located at the position (descriptor_size() - 1)
+            score += descriptor[num_bins_per_descriptor_x * num_blocks_per_descriptor_y];
+
+            if(score > threshold) // Strictly greater: a score equal to threshold is not a detection
+            {
+                DetectionWindow window;
+
+                if(win_id++ < max_num_detection_windows) // Keep only the first max_num_detection_windows detections, in scan order
+                {
+                    window.x         = win_x * hog_info.block_stride().width;  // Window origin scaled from block-stride units by the block stride
+                    window.y         = win_y * hog_info.block_stride().height;
+                    window.width     = hog_info.detection_window_size().width;
+                    window.height    = hog_info.detection_window_size().height;
+                    window.idx_class = idx_class;
+                    window.score     = score;
+
+                    windows.push_back(window);
+                }
+            }
+        }
+    }
+
+    return windows;
+}
+
+template std::vector<DetectionWindow> hog_detector(const SimpleTensor<float> &src, const std::vector<float> &descriptor, unsigned int max_num_detection_windows,
+                                                   const HOGInfo &hog_info, const Size2D &detection_window_stride, float threshold, uint16_t idx_class); // Explicit instantiation: only T = float is provided here
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/reference/HOGDetector.h b/tests/validation/reference/HOGDetector.h
new file mode 100644
index 0000000..e88acb8
--- /dev/null
+++ b/tests/validation/reference/HOGDetector.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_HOG_DETECTOR_H__
+#define __ARM_COMPUTE_TEST_HOG_DETECTOR_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "tests/SimpleTensor.h"
+
+#include <vector>
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+template <typename T>
+std::vector<DetectionWindow> hog_detector(const SimpleTensor<T> &src, const std::vector<T> &descriptor, unsigned int max_num_detection_windows,
+                                          const HOGInfo &hog_info, const Size2D &detection_window_stride, float threshold = 0.0f, uint16_t idx_class = 0); // Defined and explicitly instantiated for T = float in HOGDetector.cpp
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_TEST_HOG_DETECTOR_H__ */
diff --git a/tests/validation/reference/HOGMultiDetection.cpp b/tests/validation/reference/HOGMultiDetection.cpp
new file mode 100644
index 0000000..2f5e439
--- /dev/null
+++ b/tests/validation/reference/HOGMultiDetection.cpp
@@ -0,0 +1,279 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "HOGMultiDetection.h"
+
+#include "Derivative.h"
+#include "HOGDescriptor.h"
+#include "HOGDetector.h"
+#include "Magnitude.h"
+#include "Phase.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+namespace
+{
+void validate_models(const std::vector<HOGInfo> &models) // Checks that models is non-empty and that every model agrees with models[0] on the parameters compared below
+{
+    ARM_COMPUTE_ERROR_ON(0 == models.size()); // At least one model is required
+
+    for(size_t i = 1; i < models.size(); ++i) // Compare each further model against the first one
+    {
+        ARM_COMPUTE_ERROR_ON_MSG(models[0].phase_type() != models[i].phase_type(),
+                                 "All HOG parameters must have the same phase type");
+
+        ARM_COMPUTE_ERROR_ON_MSG(models[0].normalization_type() != models[i].normalization_type(),
+                                 "All HOG parameters must have the same normalization_type");
+
+        ARM_COMPUTE_ERROR_ON_MSG((models[0].l2_hyst_threshold() != models[i].l2_hyst_threshold()) && (models[0].normalization_type() == arm_compute::HOGNormType::L2HYS_NORM),
+                                 "All HOG parameters must have the same l2 hysteresis threshold if you use L2 hysteresis normalization type"); // Threshold mismatch is only an error when L2HYS_NORM is in use
+    }
+}
+} // namespace
+
+void detection_windows_non_maxima_suppression(std::vector<DetectionWindow> &multi_windows, float min_distance) // Modifies multi_windows in place: sorts it, drops same-class windows whose centres lie within min_distance of a higher-ranked one, and shrinks it
+{
+    const size_t num_candidates = multi_windows.size();
+    size_t       num_detections = 0; // Number of windows kept so far; also the next write position
+
+    // Sort by ascending idx_class first and by descending score second
+    std::sort(multi_windows.begin(), multi_windows.end(), [](const DetectionWindow & lhs, const DetectionWindow & rhs)
+    {
+        if(lhs.idx_class < rhs.idx_class)
+        {
+            return true;
+        }
+        if(rhs.idx_class < lhs.idx_class)
+        {
+            return false;
+        }
+
+        // idx_classes are equal so compare by score (higher score sorts first)
+        if(lhs.score > rhs.score)
+        {
+            return true;
+        }
+        if(rhs.score > lhs.score)
+        {
+            return false;
+        }
+
+        return false; // Same class and same score: neither element orders before the other
+    });
+
+    const float min_distance_pow2 = min_distance * min_distance; // Squared threshold, compared against the squared distance below
+
+    // Greedy suppression: keep each still-valid window, then invalidate later same-class windows whose centre is closer than min_distance (Euclidean distance)
+    for(size_t i = 0; i < num_candidates; ++i)
+    {
+        if(0.0f != multi_windows.at(i).score) // A score of exactly 0 marks a suppressed window; windows that arrive with a 0 score are dropped too
+        {
+            DetectionWindow cur;
+            cur.x         = multi_windows.at(i).x;
+            cur.y         = multi_windows.at(i).y;
+            cur.width     = multi_windows.at(i).width;
+            cur.height    = multi_windows.at(i).height;
+            cur.idx_class = multi_windows.at(i).idx_class;
+            cur.score     = multi_windows.at(i).score;
+
+            // Store window at the front of the vector (num_detections <= i here, so no unvisited candidate is overwritten)
+            multi_windows.at(num_detections) = cur;
+            ++num_detections;
+
+            const float xc = cur.x + cur.width * 0.5f;  // Centre of the kept window
+            const float yc = cur.y + cur.height * 0.5f;
+
+            for(size_t k = i + 1; k < (num_candidates) && (cur.idx_class == multi_windows.at(k).idx_class); ++k) // The sort makes same-class windows contiguous, so stop at the first other class
+            {
+                const float xn = multi_windows.at(k).x + multi_windows.at(k).width * 0.5f;  // Centre of the candidate window
+                const float yn = multi_windows.at(k).y + multi_windows.at(k).height * 0.5f;
+
+                const float dx = std::fabs(xn - xc);
+                const float dy = std::fabs(yn - yc);
+
+                if(dx < min_distance && dy < min_distance) // Per-axis check first; the squared distance is only computed when both axes pass
+                {
+                    const float d = dx * dx + dy * dy; // Squared Euclidean distance between the two centres
+
+                    if(d < min_distance_pow2)
+                    {
+                        // Invalidate detection window
+                        multi_windows.at(k).score = 0.0f;
+                    }
+                }
+            }
+        }
+    }
+
+    multi_windows.resize(num_detections); // Discard everything behind the kept windows
+}
+
+template <typename T> // Reference multi-model HOG detection: derivative -> magnitude/phase -> orientation binning -> block normalization -> hog_detector per model, with optional non-maxima suppression
+std::vector<DetectionWindow> hog_multi_detection(const SimpleTensor<T> &src, BorderMode border_mode, T constant_border_value,
+                                                 const std::vector<HOGInfo> &models, std::vector<std::vector<float>> descriptors,
+                                                 unsigned int max_num_detection_windows, float threshold, bool non_maxima_suppression, float min_distance)
+{
+    ARM_COMPUTE_ERROR_ON(descriptors.size() != models.size()); // One descriptor per model
+    validate_models(models);
+
+    const size_t width      = src.shape().x();
+    const size_t height     = src.shape().y();
+    const size_t num_models = models.size();
+
+    // Initialize previous values from the first model; each later model is compared against them
+    size_t prev_num_bins     = models[0].num_bins();
+    Size2D prev_cell_size    = models[0].cell_size();
+    Size2D prev_block_size   = models[0].block_size();
+    Size2D prev_block_stride = models[0].block_stride();
+
+    std::vector<size_t> input_orient_bin;                    // Indices of the models that need their own orientation binning
+    std::vector<size_t> input_hog_detect;                    // Per model: index of the block-normalization output it is detected on
+    std::vector<std::pair<size_t, size_t>> input_block_norm; // Pairs of (model index, index of the orientation-binning output to normalize)
+
+    input_orient_bin.push_back(0); // The first model always runs every stage
+    input_hog_detect.push_back(0);
+    input_block_norm.emplace_back(0, 0);
+
+    // Iterate through the number of models and check if orientation binning
+    // and block normalization steps can be skipped
+    for(size_t i = 1; i < num_models; ++i)
+    {
+        size_t cur_num_bins     = models[i].num_bins();
+        Size2D cur_cell_size    = models[i].cell_size();
+        Size2D cur_block_size   = models[i].block_size();
+        Size2D cur_block_stride = models[i].block_stride();
+
+        // Check if binning and normalization steps are required (bin count or cell size changed)
+        if((cur_num_bins != prev_num_bins) || (cur_cell_size.width != prev_cell_size.width) || (cur_cell_size.height != prev_cell_size.height))
+        {
+            prev_num_bins     = cur_num_bins;
+            prev_cell_size    = cur_cell_size;
+            prev_block_size   = cur_block_size;
+            prev_block_stride = cur_block_stride;
+
+            // Compute orientation binning and block normalization. Update input to process
+            input_orient_bin.push_back(i);
+            input_block_norm.emplace_back(i, input_orient_bin.size() - 1);
+        }
+        else if((cur_block_size.width != prev_block_size.width) || (cur_block_size.height != prev_block_size.height) || (cur_block_stride.width != prev_block_stride.width)
+                || (cur_block_stride.height != prev_block_stride.height))
+        {
+            prev_block_size   = cur_block_size;
+            prev_block_stride = cur_block_stride;
+
+            // Compute block normalization only, reusing the most recent orientation binning. Update input to process
+            input_block_norm.emplace_back(i, input_orient_bin.size() - 1);
+        }
+
+        // Update input to process for hog detector (always the most recent block normalization)
+        input_hog_detect.push_back(input_block_norm.size() - 1);
+    }
+
+    size_t num_orient_bin = input_orient_bin.size();
+    size_t num_block_norm = input_block_norm.size();
+    size_t num_hog_detect = input_hog_detect.size(); // One entry per model
+
+    std::vector<SimpleTensor<float>> hog_spaces(num_orient_bin);      // Orientation-binning outputs
+    std::vector<SimpleTensor<float>> hog_norm_spaces(num_block_norm); // Block-normalization outputs
+
+    // Calculate derivative (computed once and shared by all models)
+    SimpleTensor<int16_t> grad_x;
+    SimpleTensor<int16_t> grad_y;
+    std::tie(grad_x, grad_y) = derivative<int16_t>(src, border_mode, constant_border_value, GradientDimension::GRAD_XY);
+
+    // Calculate magnitude and phase
+    SimpleTensor<int16_t> _mag   = magnitude(grad_x, grad_y, MagnitudeType::L2NORM);
+    SimpleTensor<uint8_t> _phase = phase(grad_x, grad_y, models[0].phase_type()); // validate_models() requires a common phase type, so models[0] stands for all models
+
+    // Calculate Tensors for the HOG space and orientation binning
+    for(size_t i = 0; i < num_orient_bin; ++i)
+    {
+        const size_t idx_multi_hog = input_orient_bin[i]; // Model that defines bin count and cell size for this HOG space
+
+        const size_t num_bins    = models[idx_multi_hog].num_bins();
+        const size_t num_cells_x = width / models[idx_multi_hog].cell_size().width;
+        const size_t num_cells_y = height / models[idx_multi_hog].cell_size().height;
+
+        // TensorShape of hog space: one element per cell
+        TensorShape hog_space_shape(num_cells_x, num_cells_y);
+
+        // Initialise HOG space with num_bins channels per cell
+        TensorInfo info_hog_space(hog_space_shape, num_bins, DataType::F32);
+        hog_spaces.at(i) = SimpleTensor<float>(info_hog_space.tensor_shape(), DataType::F32, info_hog_space.num_channels());
+
+        // For each cell create histogram based on magnitude and phase
+        hog_orientation_binning(_mag, _phase, hog_spaces[i], models[idx_multi_hog]);
+    }
+
+    // Calculate Tensors for the normalized HOG space and block normalization
+    for(size_t i = 0; i < num_block_norm; ++i)
+    {
+        const size_t idx_multi_hog  = input_block_norm[i].first;  // Model that defines the block geometry
+        const size_t idx_orient_bin = input_block_norm[i].second; // HOG space to normalize
+
+        // Create tensor info for HOG descriptor from the model and the source image size
+        TensorInfo tensor_info(models[idx_multi_hog], src.shape().x(), src.shape().y());
+        hog_norm_spaces.at(i) = SimpleTensor<float>(tensor_info.tensor_shape(), DataType::F32, tensor_info.num_channels());
+
+        // Normalize histograms based on block size
+        hog_block_normalization(hog_norm_spaces[i], hog_spaces[idx_orient_bin], models[idx_multi_hog]);
+    }
+
+    std::vector<DetectionWindow> multi_windows; // Detections of all models, concatenated in model order
+
+    // Calculate Detection Windows for HOG detector
+    for(size_t i = 0; i < num_hog_detect; ++i)
+    {
+        const size_t idx_block_norm = input_hog_detect[i];
+
+        // NOTE: Detection window stride fixed to block stride
+        const Size2D detection_window_stride = models[i].block_stride();
+
+        std::vector<DetectionWindow> windows = hog_detector(hog_norm_spaces[idx_block_norm], descriptors[i],
+                                                            max_num_detection_windows, models[i], detection_window_stride, threshold, i); // The model index doubles as idx_class; the window cap is passed to each call, i.e. applied per model
+
+        multi_windows.insert(multi_windows.end(), windows.begin(), windows.end());
+    }
+
+    // Suppress Non-maxima detection windows
+    if(non_maxima_suppression)
+    {
+        detection_windows_non_maxima_suppression(multi_windows, min_distance);
+    }
+
+    return multi_windows;
+}
+
+template std::vector<DetectionWindow> hog_multi_detection(const SimpleTensor<uint8_t> &src, BorderMode border_mode, uint8_t constant_border_value,
+                                                          const std::vector<HOGInfo> &models, std::vector<std::vector<float>> descriptors,
+                                                          unsigned int max_num_detection_windows, float threshold, bool non_maxima_suppression, float min_distance); // Explicit instantiation: only T = uint8_t is provided here
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/reference/HOGMultiDetection.h b/tests/validation/reference/HOGMultiDetection.h
new file mode 100644
index 0000000..6d75bf4
--- /dev/null
+++ b/tests/validation/reference/HOGMultiDetection.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_HOG_MULTI_DETECTION_H__
+#define __ARM_COMPUTE_TEST_HOG_MULTI_DETECTION_H__
+
+#include "arm_compute/core/Types.h"
+#include "tests/SimpleTensor.h"
+
+#include <vector>
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+template <typename T> // T: element type of the source tensor and of the constant border value
+std::vector<DetectionWindow> hog_multi_detection(const SimpleTensor<T> &src, BorderMode border_mode, T constant_border_value,
+                                                 const std::vector<HOGInfo> &models, std::vector<std::vector<float>> descriptors,
+                                                 unsigned int max_num_detection_windows, float threshold = 0.0f, bool non_maxima_suppression = false, float min_distance = 1.0f); // returns the windows gathered from all detectors; non-maxima suppression (using min_distance) is applied only when requested
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_TEST_HOG_MULTI_DETECTION_H__ */
diff --git a/tests/validation/reference/Im2Col.cpp b/tests/validation/reference/Im2Col.cpp
new file mode 100644
index 0000000..5685b60
--- /dev/null
+++ b/tests/validation/reference/Im2Col.cpp
@@ -0,0 +1,155 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "Im2Col.h"
+
+#include "Permute.h"
+
+#include "arm_compute/core/Types.h"
+#include "tests/validation/Helpers.h"
+#include "tests/validation/reference/Utils.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+template <typename T>
+void im2col_nchw(const SimpleTensor<T> &src, SimpleTensor<T> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias)
+{
+    ARM_COMPUTE_ERROR_ON(src.data_layout() != DataLayout::NCHW);
+    // NCHW reference: x indexes the width, y the height and z the channels of src
+    const int kernel_w    = kernel_dims.width;
+    const int kernel_h    = kernel_dims.height;
+    const int pad_w       = conv_info.pad().first;
+    const int pad_h       = conv_info.pad().second;
+    const int step_x      = conv_info.stride().first;
+    const int step_y      = conv_info.stride().second;
+    const int last_x      = static_cast<int>(src.shape().x()) + pad_w - kernel_w; // last window origin along x
+    const int last_y      = static_cast<int>(src.shape().y()) + pad_h - kernel_h; // last window origin along y
+    const int num_planes  = src.shape().z();
+    const int num_batches = src.shape().total_size_upper(3);
+    const int fill_value  = is_data_type_quantized_asymmetric(src.data_type()) ? src.quantization_info().offset : 0;
+
+    int out_pos = 0; // linear write position in dst
+    for(int batch = 0; batch < num_batches; ++batch)
+    {
+        for(int win_y = -pad_h; win_y <= last_y; win_y += step_y)
+        {
+            for(int win_x = -pad_w; win_x <= last_x; win_x += step_x)
+            {
+                for(int plane = 0; plane < num_planes; ++plane)
+                {
+                    for(int dy = 0; dy < kernel_h; ++dy)
+                    {
+                        for(int dx = 0; dx < kernel_w; ++dx)
+                        {
+                            dst[out_pos++] = tensor_elem_at(src, Coordinates(win_x + dx, win_y + dy, plane, batch), BorderMode::CONSTANT, static_cast<T>(fill_value));
+                        }
+                    }
+                }
+                // One extra element set to 1 follows each window when a bias is requested
+                if(has_bias)
+                {
+                    dst[out_pos++] = static_cast<T>(1);
+                }
+            }
+        }
+    }
+}
+
+template <typename T>
+void im2col_nhwc(const SimpleTensor<T> &src, SimpleTensor<T> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias)
+{
+    ARM_COMPUTE_ERROR_ON(src.data_layout() != DataLayout::NHWC);
+    const int kernel_w    = kernel_dims.width;
+    const int kernel_h    = kernel_dims.height;
+    const int pad_w       = conv_info.pad().first;
+    const int pad_h       = conv_info.pad().second;
+    const int step_x      = conv_info.stride().first;
+    const int step_y      = conv_info.stride().second;
+    const int last_x      = static_cast<int>(src.shape().y()) + pad_w - kernel_w; // NHWC: shape y is the width
+    const int last_y      = static_cast<int>(src.shape().z()) + pad_h - kernel_h; // NHWC: shape z is the height
+    const int num_planes  = src.shape().x();                                      // NHWC: shape x is the channel count
+    const int num_batches = src.shape().total_size_upper(3);
+    const int fill_value  = is_data_type_quantized_asymmetric(src.data_type()) ? src.quantization_info().offset : 0;
+    int       out_pos     = 0; // linear write position in dst
+    for(int batch = 0; batch < num_batches; ++batch)
+    {
+        for(int win_y = -pad_h; win_y <= last_y; win_y += step_y)
+        {
+            for(int win_x = -pad_w; win_x <= last_x; win_x += step_x)
+            {
+                for(int plane = 0; plane < num_planes; ++plane)
+                {
+                    for(int dy = 0; dy < kernel_h; ++dy)
+                    {
+                        for(int dx = 0; dx < kernel_w; ++dx)
+                        {
+                            dst[out_pos++] = tensor_elem_at(src, Coordinates(plane, win_x + dx, win_y + dy, batch), BorderMode::CONSTANT, static_cast<T>(fill_value));
+                        }
+                    }
+                }
+                // One extra element set to 1 follows each window when a bias is requested
+                if(has_bias)
+                {
+                    dst[out_pos++] = static_cast<T>(1);
+                }
+            }
+        }
+    }
+}
+
+template <typename T>
+void im2col(const SimpleTensor<T> &src, SimpleTensor<T> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias)
+{
+    // Pick the reference implementation that matches the data layout of the source tensor.
+    // All arguments are forwarded unchanged.
+    const DataLayout layout = src.data_layout();
+
+    if(layout == DataLayout::NCHW)
+    {
+        im2col_nchw(src, dst, kernel_dims, conv_info, has_bias);
+        return;
+    }
+
+    if(layout == DataLayout::NHWC)
+    {
+        im2col_nhwc(src, dst, kernel_dims, conv_info, has_bias);
+        return;
+    }
+
+    // Any other layout is reported as an error
+    ARM_COMPUTE_ERROR("Not supported.");
+}
+
+template void im2col(const SimpleTensor<uint8_t> &src, SimpleTensor<uint8_t> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias); // explicit instantiations: uint8_t, half and float
+template void im2col(const SimpleTensor<half> &src, SimpleTensor<half> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias);
+template void im2col(const SimpleTensor<float> &src, SimpleTensor<float> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias);
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/reference/Im2Col.h b/tests/validation/reference/Im2Col.h
new file mode 100644
index 0000000..5277171
--- /dev/null
+++ b/tests/validation/reference/Im2Col.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_IM2COL_H__
+#define __ARM_COMPUTE_TEST_IM2COL_H__
+
+#include "tests/SimpleTensor.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+template <typename T> // reference im2col: fills dst from src, choosing the NCHW or NHWC path from src.data_layout()
+void im2col(const SimpleTensor<T> &src, SimpleTensor<T> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias);
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_TEST_IM2COL_H__ */
diff --git a/tests/validation/reference/LocallyConnected.cpp b/tests/validation/reference/LocallyConnected.cpp
new file mode 100644
index 0000000..08e3f02
--- /dev/null
+++ b/tests/validation/reference/LocallyConnected.cpp
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "LocallyConnected.h"
+
+#include "tests/validation/Helpers.h"
+#include "tests/validation/reference/Convolution3d.h"
+#include "tests/validation/reference/Utils.h"
+
+#include "tests/framework/Asserts.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+template <typename T, typename TB>
+SimpleTensor<T> locally_connected(const SimpleTensor<T> &src, const SimpleTensor<T> &weights, const SimpleTensor<TB> &bias, const TensorShape &output_shape, const PadStrideInfo &info)
+{
+    // Create reference: the output takes the requested shape and inherits data type, fixed point position and quantization info from src
+    SimpleTensor<T> dst{ output_shape, src.data_type(), 1, src.fixed_point_position(), src.quantization_info() };
+
+    // Compute reference: gather the x/y/z extents of input, output and weights
+    const int width_in  = src.shape().x();
+    const int height_in = src.shape().y();
+    const int depth_in  = src.shape().z();
+
+    const int width_out  = dst.shape().x();
+    const int height_out = dst.shape().y();
+    const int depth_out  = dst.shape().z();
+
+    const int width_weights  = weights.shape().x();
+    const int height_weights = weights.shape().y();
+    const int depth_weights  = weights.shape().z();
+
+    const int pad_left  = info.pad_left();
+    const int pad_top   = info.pad_top();
+    const int stride_xi = info.stride().first;
+    const int stride_yi = info.stride().second;
+
+    auto output_wh = scaled_dimensions(width_in, height_in, width_weights, height_weights, info); // (width, height) pair, used below as the number of steps per axis
+
+    const int start_xi    = width_weights / 2 - pad_left; // NOTE(review): presumably the kernel-centre x coordinate of the first window - confirm against convolution3d
+    const int start_yi    = height_weights / 2 - pad_top; // NOTE(review): presumably the kernel-centre y coordinate of the first window - confirm against convolution3d
+    const int end_xi      = output_wh.first * stride_xi;  // extent covered along x, measured from start_xi
+    const int end_yi      = output_wh.second * stride_yi; // extent covered along y, measured from start_yi
+    const int num_batches = src.shape().total_size() / (width_in * height_in * depth_in); // total elements divided by the size of one 3D input volume
+
+    for(int r = 0; r < num_batches; ++r)
+    {
+        int count = 0; // running index over (output position, output feature map) pairs; restarts for every batch
+        for(int yi = start_yi; yi < start_yi + end_yi; yi += stride_yi)
+        {
+            for(int xi = start_xi; xi < start_xi + end_xi; xi += stride_xi)
+            {
+                for(int ofm = 0; ofm < depth_out; ++ofm)
+                {
+                    // Compute input and output offsets (linear element offsets of the current batch / output element)
+                    const int offset_in  = r * width_in * height_in * depth_in;
+                    const int xo         = (xi - start_xi) / stride_xi;
+                    const int yo         = (yi - start_yi) / stride_yi;
+                    const int offset_out = xo + yo * width_out + ofm * width_out * height_out + r * width_out * height_out * depth_out;
+
+                    ARM_COMPUTE_ASSERT(xo < width_out);
+                    ARM_COMPUTE_ASSERT(yo < height_out);
+
+                    // Compute 3D convolution; count selects a separate weight volume for every pair
+                    // NOTE(review): the 2nd and 3rd offsets are presumably the weights and bias offsets - confirm against the convolution3d signature
+                    convolution_3d::detail::convolution3d(src, weights, bias, dst,
+                                                          offset_in, count * width_weights * height_weights * depth_weights, count, offset_out,
+                                                          xi, yi,
+                                                          width_in, height_in, depth_in,
+                                                          width_weights, height_weights);
+                    count++;
+                }
+            }
+        }
+    }
+
+    return dst;
+}
+
+// Locally Connected only supports F32, so float (for both T and TB) is the only explicit instantiation
+template SimpleTensor<float> locally_connected(const SimpleTensor<float> &src, const SimpleTensor<float> &weights, const SimpleTensor<float> &bias, const TensorShape &output_shape,
+                                               const PadStrideInfo &info);
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/reference/LocallyConnected.h b/tests/validation/reference/LocallyConnected.h
new file mode 100644
index 0000000..bf78d2c
--- /dev/null
+++ b/tests/validation/reference/LocallyConnected.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_LOCALLY_CONNECTED_H__
+#define __ARM_COMPUTE_TEST_LOCALLY_CONNECTED_H__
+
+#include "tests/SimpleTensor.h"
+#include "tests/validation/Helpers.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+template <typename T, typename TB> // T: element type of src, weights and the returned tensor; TB: element type of bias
+SimpleTensor<T> locally_connected(const SimpleTensor<T> &src, const SimpleTensor<T> &weights, const SimpleTensor<TB> &bias, const TensorShape &output_shape, const PadStrideInfo &info);
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_TEST_LOCALLY_CONNECTED_H__ */
diff --git a/tests/validation/reference/OpticalFlow.cpp b/tests/validation/reference/OpticalFlow.cpp
new file mode 100644
index 0000000..da0b9f9
--- /dev/null
+++ b/tests/validation/reference/OpticalFlow.cpp
@@ -0,0 +1,404 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "OpticalFlow.h"
+
+#include "GaussianPyramidHalf.h"
+#include "Scharr.h"
+#include "Utils.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+namespace
+{
+using KeyPointArray         = std::vector<KeyPoint>;         // key points as passed in and returned by optical_flow
+using InternalKeyPointArray = std::vector<InternalKeyPoint>; // key points with float coordinates used while tracking
+
+// Constants used for Lucas-Kanade Algorithm
+constexpr int   W_BITS                = 14;                  // number of bits of the fixed-point bilinear weights
+constexpr float D0                    = 1 << W_BITS;         // fixed-point value the four bilinear weights add up to
+constexpr float DETERMINANT_THRESHOLD = 1.0e-07f;            // points whose gradient-matrix determinant is below this are skipped
+constexpr float EIGENVALUE_THRESHOLD  = 1.0e-04f;            // points whose scaled minimum eigenvalue is below this are skipped
+constexpr float FLT_SCALE             = 1.0f / (1 << 20);    // factor applied to the integer gradient sums when converting them to floating point
+
+// Creates an InternalKeyPointArray for tracking non-integral pixel coordinates
+InternalKeyPointArray create_internal_keypoints(const KeyPointArray &keypoints)
+{
+    InternalKeyPointArray internal_keypoints;
+    internal_keypoints.reserve(keypoints.size()); // output size is known: allocate once
+
+    for(const auto &keypoint : keypoints) // iterate by reference to avoid copying every key point
+    {
+        InternalKeyPoint internal_keypoint;
+
+        internal_keypoint.x               = static_cast<float>(keypoint.x);
+        internal_keypoint.y               = static_cast<float>(keypoint.y);
+        internal_keypoint.tracking_status = static_cast<bool>(keypoint.tracking_status);
+        internal_keypoints.push_back(internal_keypoint);
+    }
+
+    return internal_keypoints;
+}
+
+// Scale tracked points based on Pyramid level
+void scale_tracked_points(size_t level, size_t num_levels, bool use_initial_estimate,
+                          InternalKeyPointArray &old_points_internal, InternalKeyPointArray &new_points_internal,
+                          const KeyPointArray &old_points, const KeyPointArray &new_points_estimates)
+{
+    const bool is_lowest_resolution = (level == num_levels - 1);
+
+    if(!is_lowest_resolution)
+    {
+        // Only rescale the coordinates that are already stored in the internal arrays:
+        // each of them is divided by SCALE_PYRAMID_HALF
+        for(size_t k = 0; k < old_points.size(); ++k)
+        {
+            old_points_internal.at(k).x /= SCALE_PYRAMID_HALF;
+            old_points_internal.at(k).y /= SCALE_PYRAMID_HALF;
+            new_points_internal.at(k).x /= SCALE_PYRAMID_HALF;
+            new_points_internal.at(k).y /= SCALE_PYRAMID_HALF;
+        }
+
+        return;
+    }
+
+    // Lowest resolution: initialise both internal arrays from the caller-provided key points,
+    // scaled by SCALE_PYRAMID_HALF raised to the power of the level
+    const float scale = std::pow(SCALE_PYRAMID_HALF, level);
+
+    for(size_t k = 0; k < old_points.size(); ++k)
+    {
+        // Old point: scaled input position, always flagged as tracked
+        InternalKeyPoint &old_keypoint = old_points_internal.at(k);
+        old_keypoint.x               = old_points.at(k).x * scale;
+        old_keypoint.y               = old_points.at(k).y * scale;
+        old_keypoint.tracking_status = true;
+
+        // New point: the scaled initial estimate (tracked only if its status equals 1) when requested,
+        // otherwise a copy of the old point with tracking enabled
+        InternalKeyPoint keypoint_to_track;
+        keypoint_to_track.x               = use_initial_estimate ? new_points_estimates.at(k).x * scale : old_keypoint.x;
+        keypoint_to_track.y               = use_initial_estimate ? new_points_estimates.at(k).y * scale : old_keypoint.y;
+        keypoint_to_track.tracking_status = use_initial_estimate ? (new_points_estimates.at(k).tracking_status == 1) : true;
+
+        new_points_internal.at(k) = keypoint_to_track;
+    }
+}
+
+bool is_invalid_keypoint(const InternalKeyPoint &keypoint, const ValidRegion &valid_region, size_t window_dimension)
+{
+    const int  radius    = window_dimension / 2;     // half of the tracking window
+    const int  col       = std::floor(keypoint.x);
+    const int  row       = std::floor(keypoint.y);
+    const bool outside_x = (col - radius < valid_region.start(0)) || (col + radius >= valid_region.end(0) - 1);
+    return outside_x || (row - radius < valid_region.start(1)) || (row + radius >= valid_region.end(1) - 1); // same window test along y
+}
+
+template <typename T>
+constexpr int INT_ROUND(T x, int n)
+{
+    return ((1 << (n - 1)) + x) >> n; // add half of 2^n, then shift right by n bits
+}
+
+// Return the bilinear value at a specified coordinate with different border modes
+template <typename T>
+int bilinear_interpolate(const SimpleTensor<T> &in, Coordinates id, float wx, float wy, BorderMode border_mode, T constant_border_value, int scale)
+{
+    const int x0 = id.x();
+    const int y0 = id.y();
+
+    // Reads the element at (x, y). The by-value `id` is reused so that only its
+    // first two dimensions are rewritten for every read.
+    const auto read_pixel = [&](int x, int y) -> T
+    {
+        id.set(0, x);
+        id.set(1, y);
+        return tensor_elem_at(in, id, border_mode, constant_border_value);
+    };
+
+    // 2x2 neighbourhood whose top-left element is (x0, y0)
+    const T tl = read_pixel(x0, y0);
+    const T tr = read_pixel(x0 + 1, y0);
+    const T bl = read_pixel(x0, y0 + 1);
+    const T br = read_pixel(x0 + 1, y0 + 1);
+
+    // Fixed-point weights derived from the fractional offsets wx / wy.
+    // They add up to D0: the last weight is computed as the remainder.
+    const float inv_wx = 1.0f - wx;
+    const float inv_wy = 1.0f - wy;
+    const int   w_tl   = roundf(inv_wx * inv_wy * D0);
+    const int   w_tr   = roundf(wx * inv_wy * D0);
+    const int   w_bl   = roundf(inv_wx * wy * D0);
+    const int   w_br   = D0 - w_tl - w_tr - w_bl;
+
+    // Weighted sum of the four elements, rounded and
+    // shifted right by `scale` bits
+    return static_cast<int>(INT_ROUND(tl * w_tl + tr * w_tr + bl * w_bl + br * w_br, scale));
+}
+
+template <typename T>
+std::vector<int> compute_derivative(const SimpleTensor<T> &input, const InternalKeyPoint &keypoint,
+                                    BorderMode border_mode, uint8_t constant_border_value, size_t window_dimension, int scale)
+{
+    const int half_window = window_dimension / 2;
+
+    // The window spans [centre - half_window, centre + half_window] on both axes, so the result size is known: reserve once
+    std::vector<int> bilinear_values;
+    bilinear_values.reserve(static_cast<size_t>(2 * half_window + 1) * static_cast<size_t>(2 * half_window + 1));
+
+    float keypoint_int_x = 0; // integral part of the key point position (window centre)
+    float keypoint_int_y = 0;
+    const float wx = std::modf(keypoint.x, &keypoint_int_x); // fractional part, used as bilinear weight
+    const float wy = std::modf(keypoint.y, &keypoint_int_y);
+
+    Coordinates tl_window(static_cast<int>(keypoint_int_x) - half_window, static_cast<int>(keypoint_int_y) - half_window);
+    Coordinates br_window(static_cast<int>(keypoint_int_x) + half_window, static_cast<int>(keypoint_int_y) + half_window);
+    for(int y = tl_window.y(); y <= br_window.y(); ++y) // row by row, left to right
+    {
+        for(int x = tl_window.x(); x <= br_window.x(); ++x)
+        {
+            bilinear_values.push_back(bilinear_interpolate(input, Coordinates(x, y), wx, wy, border_mode, static_cast<T>(constant_border_value), scale));
+        }
+    }
+
+    return bilinear_values;
+}
+
+std::tuple<float, float, float> compute_spatial_gradient_matrix(const std::vector<int> &bilinear_ix, const std::vector<int> &bilinear_iy)
+{
+    ARM_COMPUTE_ERROR_ON(bilinear_ix.size() != bilinear_iy.size());
+    // Integer sums of the gradient products: sum(ix * ix), sum(ix * iy) and sum(iy * iy)
+    int sum_xx = 0;
+    int sum_xy = 0;
+    int sum_yy = 0;
+
+    for(size_t k = 0; k < bilinear_ix.size(); ++k)
+    {
+        const int gx = bilinear_ix[k];
+        const int gy = bilinear_iy[k];
+
+        sum_xx += gx * gx;
+        sum_xy += gx * gy;
+        sum_yy += gy * gy;
+    }
+    // Scale the integer sums by FLT_SCALE to obtain the returned floating point values
+    return std::make_tuple(sum_xx * FLT_SCALE, sum_xy * FLT_SCALE, sum_yy * FLT_SCALE);
+}
+
+std::tuple<double, double> compute_temporal_gradient_vector(const std::vector<int> &bilinear_it_old,
+                                                            const std::vector<int> &bilinear_it_new,
+                                                            const std::vector<int> &bilinear_ix,
+                                                            const std::vector<int> &bilinear_iy)
+{
+    ARM_COMPUTE_ERROR_ON(bilinear_ix.size() != bilinear_iy.size());
+    ARM_COMPUTE_ERROR_ON(bilinear_it_old.size() != bilinear_it_new.size());
+    // The loop below indexes all four vectors with the same index, so the two pairs must agree as well
+    ARM_COMPUTE_ERROR_ON(bilinear_ix.size() != bilinear_it_old.size());
+
+    // Integer sums of (new - old) multiplied by the x and y gradient respectively
+    int ib1 = 0;
+    int ib2 = 0;
+
+    for(size_t i = 0; i < bilinear_ix.size(); ++i)
+    {
+        // Difference between the new and the old value at this window position
+        const int diff = bilinear_it_new[i] - bilinear_it_old[i];
+
+        ib1 += diff * bilinear_ix[i];
+        ib2 += diff * bilinear_iy[i];
+    }
+
+    // Scale the integer sums by FLT_SCALE; the float products are then widened to double
+    const double b1 = ib1 * FLT_SCALE;
+    const double b2 = ib2 * FLT_SCALE;
+
+    return std::make_tuple(b1, b2);
+}
+} // namespace
+
+template <typename T>
+std::vector<KeyPoint> optical_flow(const SimpleTensor<T> &old_input, const SimpleTensor<T> &new_input,
+                                   const OpticalFlowParameters &params, size_t num_levels,
+                                   const std::vector<KeyPoint> &old_points, const std::vector<KeyPoint> &new_points_estimates,
+                                   BorderMode border_mode, uint8_t constant_border_value)
+{
+    const int    filter_size      = 3;    // scharr filter size
+    const size_t max_iterations   = 1000; // fixed by kernel
+    const size_t window_dimension = params.window_dimension;
+    const size_t num_iterations   = (params.termination == Termination::TERM_CRITERIA_EPSILON) ? max_iterations : params.num_iterations; // epsilon-only termination is still capped at max_iterations
+
+    KeyPointArray new_points(old_points.size()); // result: one entry per old point
+
+    InternalKeyPointArray old_points_internal = create_internal_keypoints(old_points);
+    InternalKeyPointArray new_points_internal = create_internal_keypoints(new_points_estimates);
+
+    SimpleTensor<int16_t> scharr_gx;
+    SimpleTensor<int16_t> scharr_gy;
+
+    // Create pyramids of the old and the new frame; they are indexed by level below
+    std::vector<SimpleTensor<T>> old_pyramid = gaussian_pyramid_half(old_input, border_mode, constant_border_value, num_levels);
+    std::vector<SimpleTensor<T>> new_pyramid = gaussian_pyramid_half(new_input, border_mode, constant_border_value, num_levels);
+
+    // Iterate over each level of the pyramid, from level num_levels - 1 down to level 0
+    for(size_t idx = num_levels; idx > 0; --idx) // idx is level + 1 so that the unsigned counter never has to go below zero
+    {
+        const size_t level = idx - 1;
+
+        // Calculate scharr gradients
+        std::tie(scharr_gx, scharr_gy) = scharr<int16_t, T>(old_pyramid[level], filter_size, border_mode, constant_border_value, GradientDimension::GRAD_XY);
+
+        scale_tracked_points(level, num_levels, params.use_initial_estimate, old_points_internal, new_points_internal, old_points, new_points_estimates);
+
+        // Calculate valid region based on image dimensions of current pyramid level
+        const ValidRegion valid_region = shape_to_valid_region(old_pyramid[level].shape(), (border_mode == BorderMode::UNDEFINED), BorderSize(filter_size / 2));
+
+        for(size_t i = 0; i < old_points.size(); ++i)
+        {
+            InternalKeyPoint &old_keypoint = old_points_internal.at(i);
+            InternalKeyPoint &new_keypoint = new_points_internal.at(i);
+
+            // Helper function for untracking keypoints when on the lowest pyramid level (high resolution)
+            const auto untrack_keypoint = [&](bool predicate) // returns predicate unchanged; only at level 0 does a true predicate also clear the tracking status
+            {
+                if(predicate && (level == 0))
+                {
+                    new_keypoint.tracking_status = false;
+                    return true;
+                }
+                return predicate;
+            };
+
+            if(!old_keypoint.tracking_status)
+            {
+                continue;
+            }
+
+            // Check if tracked coordinate is outside image coordinate
+            if(untrack_keypoint(is_invalid_keypoint(old_keypoint, valid_region, window_dimension)))
+            {
+                continue;
+            }
+
+            // Compute spatial derivative
+            std::vector<int> bilinear_ix = compute_derivative(scharr_gx, old_keypoint, border_mode, constant_border_value, window_dimension, W_BITS);
+            std::vector<int> bilinear_iy = compute_derivative(scharr_gy, old_keypoint, border_mode, constant_border_value, window_dimension, W_BITS);
+
+            float A11 = 0.f;
+            float A12 = 0.f;
+            float A22 = 0.f;
+            std::tie(A11, A12, A22) = compute_spatial_gradient_matrix(bilinear_ix, bilinear_iy);
+
+            // Calculate criteria for lost tracking (matrix A = [A11 A12; A12 A22] must be safely invertible). The point is skipped when:
+            // 1. The determinant of the matrix is less than DETERMINANT_THRESHOLD
+            // 2. The minimum eigenvalue of the matrix is less than EIGENVALUE_THRESHOLD
+            const float trace_A      = A11 + A22;
+            const float determinant  = A11 * A22 - A12 * A12;
+            const float discriminant = (trace_A * trace_A) - 4.0f * (determinant);
+            const float eigenvalue_A = (trace_A - std::sqrt(discriminant)) / 2.0f; // smaller root of the characteristic polynomial
+
+            // Divide by window_dimension squared to reduce the floating point accumulation error
+            const float eigenvalue = eigenvalue_A / (window_dimension * window_dimension);
+
+            // Check if it is a good point to track
+            if(untrack_keypoint(eigenvalue < EIGENVALUE_THRESHOLD || determinant < DETERMINANT_THRESHOLD))
+            {
+                continue;
+            }
+
+            float prev_delta_x = 0.f; // update applied in the previous iteration
+            float prev_delta_y = 0.f;
+
+            for(size_t j = 0; j < num_iterations; ++j)
+            {
+                // Check if tracked coordinate is outside image coordinate
+                if(untrack_keypoint(is_invalid_keypoint(new_keypoint, valid_region, window_dimension)))
+                {
+                    break;
+                }
+
+                // Compute temporal derivative
+                std::vector<int> bilinear_it_old = compute_derivative(old_pyramid[level], old_keypoint, border_mode, constant_border_value, window_dimension, W_BITS - 5);
+                std::vector<int> bilinear_it_new = compute_derivative(new_pyramid[level], new_keypoint, border_mode, constant_border_value, window_dimension, W_BITS - 5);
+
+                double b1 = 0.f;
+                double b2 = 0.f;
+                std::tie(b1, b2) = compute_temporal_gradient_vector(bilinear_it_old, bilinear_it_new, bilinear_ix, bilinear_iy);
+
+                // Compute motion vector -> A^-1 * -b
+                const float delta_x = (A12 * b2 - A22 * b1) / determinant;
+                const float delta_y = (A12 * b1 - A11 * b2) / determinant;
+
+                // Update the new position
+                new_keypoint.x += delta_x;
+                new_keypoint.y += delta_y;
+
+                const float magnitude_squared = delta_x * delta_x + delta_y * delta_y;
+
+                // Check if termination criteria is EPSILON and if it is satisfied
+                if(magnitude_squared <= params.epsilon && (params.termination == Termination::TERM_CRITERIA_EPSILON || params.termination == Termination::TERM_CRITERIA_BOTH))
+                {
+                    break;
+                }
+
+                // Check convergence analyzing the previous delta: two consecutive updates that (almost) cancel each other out
+                if(j > 0 && (std::fabs(delta_x + prev_delta_x) < 0.01f && std::fabs(delta_y + prev_delta_y) < 0.01f))
+                {
+                    new_keypoint.x -= delta_x * SCALE_PYRAMID_HALF; // take back part of the last update before stopping
+                    new_keypoint.y -= delta_y * SCALE_PYRAMID_HALF;
+
+                    break;
+                }
+
+                prev_delta_x = delta_x;
+                prev_delta_y = delta_y;
+            }
+        }
+    }
+
+    // Copy optical flow coordinates to output vector (coordinates rounded with roundf, tracking status stored as 1 / 0)
+    for(size_t i = 0; i < old_points.size(); ++i)
+    {
+        const InternalKeyPoint &new_keypoint = new_points_internal.at(i);
+
+        new_points.at(i).x               = roundf(new_keypoint.x);
+        new_points.at(i).y               = roundf(new_keypoint.y);
+        new_points.at(i).tracking_status = new_keypoint.tracking_status ? 1 : 0;
+    }
+
+    return new_points;
+}
+
+template std::vector<KeyPoint> optical_flow(const SimpleTensor<uint8_t> &old_input, const SimpleTensor<uint8_t> &new_input, // explicit instantiation: uint8_t frames only
+                                            const OpticalFlowParameters &params, size_t num_levels,
+                                            const std::vector<KeyPoint> &old_points, const std::vector<KeyPoint> &new_points_estimates,
+                                            BorderMode border_mode, uint8_t constant_border_value);
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/reference/OpticalFlow.h b/tests/validation/reference/OpticalFlow.h
new file mode 100644
index 0000000..ad6e2a9
--- /dev/null
+++ b/tests/validation/reference/OpticalFlow.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_OPTICAL_FLOW_H__
+#define __ARM_COMPUTE_TEST_OPTICAL_FLOW_H__
+
+#include "tests/SimpleTensor.h"
+#include "tests/Types.h"
+
+#include <vector>
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+template <typename T>
+std::vector<KeyPoint> optical_flow(const SimpleTensor<T> &old_input, const SimpleTensor<T> &new_input,
+                                   const OpticalFlowParameters &params, size_t num_levels,
+                                   const std::vector<KeyPoint> &old_points, const std::vector<KeyPoint> &new_points_estimates,
+                                   BorderMode border_mode, uint8_t constant_border_value);
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_TEST_OPTICAL_FLOW_H__ */
diff --git a/tests/validation/reference/Permute.cpp b/tests/validation/reference/Permute.cpp
index 4a12ca6..bbb2e8d 100644
--- a/tests/validation/reference/Permute.cpp
+++ b/tests/validation/reference/Permute.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -42,7 +42,7 @@
     permute(dst_shape, perm);
 
     // Create reference
-    SimpleTensor<T> dst{ dst_shape, src.data_type() };
+    SimpleTensor<T> dst{ dst_shape, src.data_type(), src.num_channels(), src.fixed_point_position(), src.quantization_info() };
 
     // Compute reference
     for(int i = 0; i < src.num_elements(); ++i)
@@ -57,9 +57,13 @@
     return dst;
 }
 
+template SimpleTensor<int8_t> permute(const SimpleTensor<int8_t> &src, PermutationVector perm);
 template SimpleTensor<uint8_t> permute(const SimpleTensor<uint8_t> &src, PermutationVector perm);
+template SimpleTensor<int16_t> permute(const SimpleTensor<int16_t> &src, PermutationVector perm);
 template SimpleTensor<uint16_t> permute(const SimpleTensor<uint16_t> &src, PermutationVector perm);
 template SimpleTensor<uint32_t> permute(const SimpleTensor<uint32_t> &src, PermutationVector perm);
+template SimpleTensor<float> permute(const SimpleTensor<float> &src, PermutationVector perm);
+template SimpleTensor<half> permute(const SimpleTensor<half> &src, PermutationVector perm);
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/PoolingLayer.cpp b/tests/validation/reference/PoolingLayer.cpp
index c14ab98..6973454 100644
--- a/tests/validation/reference/PoolingLayer.cpp
+++ b/tests/validation/reference/PoolingLayer.cpp
@@ -24,6 +24,7 @@
 #include "PoolingLayer.h"
 
 #include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "tests/validation/FixedPoint.h"
 #include "tests/validation/Helpers.h"
 
@@ -35,30 +36,16 @@
 {
 namespace reference
 {
-namespace
-{
-TensorShape calculate_output_shape(TensorShape shape, const PoolingLayerInfo &info)
-{
-    TensorShape dst_shape   = shape;
-    const int   pool_size_x = info.is_global_pooling() ? shape.x() : info.pool_size().width;
-    const int   pool_size_y = info.is_global_pooling() ? shape.y() : info.pool_size().height;
-    const std::pair<unsigned int, unsigned int> scaled_dims = arm_compute::scaled_dimensions(shape.x(),
-                                                                                             shape.y(),
-                                                                                             pool_size_x,
-                                                                                             pool_size_y,
-                                                                                             info.pad_stride_info());
-    dst_shape.set(0, scaled_dims.first);
-    dst_shape.set(1, scaled_dims.second);
-
-    return dst_shape;
-}
-} // namespace
+using namespace arm_compute::misc::shape_calculator;
 
 template <typename T, typename std::enable_if<is_floating_point<T>::value, int>::type>
 SimpleTensor<T> pooling_layer(const SimpleTensor<T> &src, const PoolingLayerInfo &info)
 {
     ARM_COMPUTE_ERROR_ON(info.is_global_pooling() && (src.shape().x() != src.shape().y()));
 
+    // Create reference
+    SimpleTensor<T> dst{ compute_pool_shape(TensorInfo(src.shape(), 1, src.data_type(), src.fixed_point_position()), info), src.data_type(), 1, src.fixed_point_position() };
+
     const int   pool_size_x     = info.is_global_pooling() ? src.shape().x() : info.pool_size().width;
     const int   pool_size_y     = info.is_global_pooling() ? src.shape().y() : info.pool_size().height;
     PoolingType type            = info.pool_type();
@@ -74,9 +61,6 @@
     const auto h_src      = static_cast<int>(src.shape()[1]);
     const int  upper_dims = src.shape().total_size() / (w_src * h_src);
 
-    // Create reference
-    SimpleTensor<T> dst{ calculate_output_shape(src.shape(), info), src.data_type(), 1, src.fixed_point_position() };
-
     const auto w_dst = static_cast<int>(dst.shape()[0]);
     const auto h_dst = static_cast<int>(dst.shape()[1]);
 
@@ -173,6 +157,10 @@
 {
     ARM_COMPUTE_ERROR_ON(info.is_global_pooling() && (src.shape().x() != src.shape().y()));
 
+    const auto w_src      = static_cast<int>(src.shape()[0]);
+    const auto h_src      = static_cast<int>(src.shape()[1]);
+    const int  upper_dims = src.shape().total_size() / (w_src * h_src);
+
     const int   pool_size_x     = info.is_global_pooling() ? src.shape().x() : info.pool_size().width;
     const int   pool_size_y     = info.is_global_pooling() ? src.shape().y() : info.pool_size().height;
     PoolingType type            = info.pool_type();
@@ -184,12 +172,8 @@
     int         pad_bottom      = info.pad_stride_info().pad_bottom();
     bool        exclude_padding = info.exclude_padding();
 
-    const auto w_src      = static_cast<int>(src.shape()[0]);
-    const auto h_src      = static_cast<int>(src.shape()[1]);
-    const int  upper_dims = src.shape().total_size() / (w_src * h_src);
-
     // Create reference
-    SimpleTensor<T> dst{ calculate_output_shape(src.shape(), info), src.data_type(), 1, src.fixed_point_position() };
+    SimpleTensor<T> dst{ compute_pool_shape(TensorInfo(src.shape(), 1, src.data_type(), src.fixed_point_position()), info), src.data_type(), 1, src.fixed_point_position() };
 
     const auto w_dst = static_cast<int>(dst.shape()[0]);
     const auto h_dst = static_cast<int>(dst.shape()[1]);
diff --git a/tests/validation/reference/Scale.cpp b/tests/validation/reference/Scale.cpp
index 0cc96ab..f8a8b88 100644
--- a/tests/validation/reference/Scale.cpp
+++ b/tests/validation/reference/Scale.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,8 +23,9 @@
  */
 
 #include "Scale.h"
+
 #include "Utils.h"
-#include "arm_compute/core/utils/misc/utility.h"
+#include "arm_compute/core/utils/misc/Utility.h"
 #include "support/ToolchainSupport.h"
 
 namespace arm_compute
diff --git a/tests/validation/reference/Utils.h b/tests/validation/reference/Utils.h
index 2aa77c6..0e98bbe 100644
--- a/tests/validation/reference/Utils.h
+++ b/tests/validation/reference/Utils.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -62,11 +62,13 @@
 {
     const int x      = coord.x();
     const int y      = coord.y();
+    const int z      = coord.z();
     const int width  = src.shape().x();
     const int height = src.shape().y();
+    const int depth  = src.shape().z();
 
     // If coordinates beyond range of tensor's width or height
-    if(x < 0 || y < 0 || x >= width || y >= height)
+    if(x < 0 || y < 0 || z < 0 || x >= width || y >= height || z >= depth)
     {
         if(border_mode == BorderMode::REPLICATE)
         {
diff --git a/tests/validation/reference/WidthConcatenateLayer.cpp b/tests/validation/reference/WidthConcatenateLayer.cpp
new file mode 100644
index 0000000..fe79b4a
--- /dev/null
+++ b/tests/validation/reference/WidthConcatenateLayer.cpp
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "WidthConcatenateLayer.h"
+
+#include "tests/validation/FixedPoint.h"
+#include "tests/validation/Helpers.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+template <typename T>
+SimpleTensor<T> widthconcatenate_layer(const std::vector<SimpleTensor<T>> &srcs)
+{
+    // Reference concatenation along the width (dimension 0): row r of every source is copied into row r
+    // of the destination, one source after the other. Rows are counted over ALL dimensions above the
+    // width (height, depth and batches), so tensors with more than three dimensions are copied completely.
+    std::vector<TensorShape> shapes;
+    shapes.reserve(srcs.size());
+    for(const auto &src : srcs)
+    {
+        shapes.emplace_back(src.shape());
+    }
+
+    // Create reference
+    DataType        dst_type  = srcs.empty() ? DataType::UNKNOWN : srcs[0].data_type();
+    TensorShape     dst_shape = calculate_width_concatenate_shape(shapes);
+    SimpleTensor<T> dst(dst_shape, dst_type);
+
+    // Set output tensor to 0
+    std::fill_n(dst.data(), dst.num_elements(), 0);
+
+    // Compute reference
+    int       width_offset = 0;
+    const int width_out    = dst.shape().x();
+    T *const  dst_ptr      = dst.data();
+
+    for(const auto &src : srcs)
+    {
+        ARM_COMPUTE_ERROR_ON(width_offset >= width_out);
+
+        const int width = src.shape().x();
+        // Number of rows of this source over every dimension above the width (guarding a zero width)
+        // NOTE(review): assumes each source shares the destination's upper dimensions - confirm against calculate_width_concatenate_shape
+        const int num_rows = (width > 0) ? static_cast<int>(src.shape().total_size() / width) : 0;
+        const T  *src_ptr  = src.data();
+
+        for(int r = 0; r < num_rows; ++r)
+        {
+            // Destination rows are width_out elements long; skip the part written by previous sources
+            std::copy(src_ptr, src_ptr + width, dst_ptr + width_offset + r * width_out);
+            src_ptr += width;
+        }
+
+        width_offset += width;
+    }
+
+    return dst;
+}
+
+template SimpleTensor<float> widthconcatenate_layer(const std::vector<SimpleTensor<float>> &srcs);
+template SimpleTensor<half> widthconcatenate_layer(const std::vector<SimpleTensor<half>> &srcs);
+template SimpleTensor<qint8_t> widthconcatenate_layer(const std::vector<SimpleTensor<qint8_t>> &srcs);
+template SimpleTensor<qint16_t> widthconcatenate_layer(const std::vector<SimpleTensor<qint16_t>> &srcs);
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/reference/WidthConcatenateLayer.h b/tests/validation/reference/WidthConcatenateLayer.h
new file mode 100644
index 0000000..237e72b
--- /dev/null
+++ b/tests/validation/reference/WidthConcatenateLayer.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_WIDTHCONCATENATE_LAYER_H__
+#define __ARM_COMPUTE_TEST_WIDTHCONCATENATE_LAYER_H__
+
+#include "tests/SimpleTensor.h"
+
+#include <vector>
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+template <typename T>
+SimpleTensor<T> widthconcatenate_layer(const std::vector<SimpleTensor<T>> &srcs);
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_TEST_WIDTHCONCATENATE_LAYER_H__ */
diff --git a/tests/validation/reference/Winograd.cpp b/tests/validation/reference/Winograd.cpp
new file mode 100644
index 0000000..194a78e
--- /dev/null
+++ b/tests/validation/reference/Winograd.cpp
@@ -0,0 +1,466 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "Winograd.h"
+
+#include "tests/validation/Helpers.h"
+#include "tests/validation/reference/Utils.h"
+
+#include "arm_compute/core/Types.h"
+
+#include <algorithm>
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+namespace
+{
+template <typename T>
+void initialize_matrix_transform(SimpleTensor<T> &src, const Size2D &output_tile_size, const Size2D &kernel_size, WinogradTransformType winograd_transform_type)
+{
+    // Fills src with the constant matrix selected by (output tile size, kernel size, transform type). Winograd input transform matrices:
+    static const float imatrix2x2_3x3[] =
+    {
+        1.0f, 0.0f, -1.0f, 0.0f,
+        0.0f, 1.0f, 1.0f, 0.0f,
+        0.0f, -1.0f, 1.0f, 0.0f,
+        0.0f, 1.0f, 0.0f, -1.0f
+    };
+
+    static const float imatrix4x4_3x3[] =
+    {
+        4.0f, 0.0f, -5.0f, 0.0f, 1.0f, 0.0f,
+        0.0f, -4.0f, -4.0f, 1.0f, 1.0f, 0.0f,
+        0.0f, 4.0f, -4.0f, -1.0f, 1.0f, 0.0f,
+        0.0f, -2.0f, -1.0f, 2.0f, 1.0f, 0.0f,
+        0.0f, 2.0f, -1.0f, -2.0f, 1.0f, 0.0f,
+        0.0f, 4.0f, 0.0f, -5.0f, 0.0f, 1.0f,
+    };
+
+    static const float imatrix4x4_5x5[] =
+    {
+        1.f, 0.f, -21.f / 4.f, 0.f, 21.f / 4.f, 0.f, -1.f, 0.f,
+        0.f, 1.f, 1.f, -17.f / 4.f, -17.f / 4.f, 1.f, 1.f, 0.f,
+        0.f, -1.f, 1.f, 17.f / 4.f, -17.f / 4.f, -1.f, 1.f, 0.f,
+        0.f, 1.f / 2.f, 1.f / 4.f, -5.f / 2.f, -5.f / 4.f, 2.f, 1.f, 0.f,
+        0.f, -1.f / 2.f, 1.f / 4.f, 5.f / 2.f, -5.f / 4.f, -2.f, 1.f, 0.f,
+        0.f, 2.f, 4.f, -5.f / 2.f, -5.f, 1.f / 2.f, 1.f, 0.f,
+        0.f, -2.f, 4.f, 5.f / 2.f, -5.f, -1.f / 2.f, 1.f, 0.f,
+        0.f, -1.f, 0.f, 21.f / 4.f, 0.f, -21.f / 4.f, 0.f, 1.f
+    };
+
+    // ------------------------------------------
+
+    // Winograd filter transform matrices (same naming: <output tile>_<kernel size>)
+    static const float fmatrix2x2_3x3[] =
+    {
+        1.0f, 0.0f, 0.0f,
+        0.5f, 0.5f, 0.5f,
+        0.5f, -0.5f, 0.5f,
+        0.0f, 0.0f, 1.0f
+    };
+
+    static const float fmatrix4x4_3x3[] =
+    {
+        0.25f, 0.0f, 0.0f,
+        -1.0f / 6.0f, -1.0f / 6.0f, -1.0f / 6.0f,
+        -1.0f / 6.0f, 1.0f / 6.0f, -1.0f / 6.0f,
+        1.0f / 24.0f, 1.0f / 12.0f, 1.0f / 6.0f,
+        1.0f / 24.0f, -1.0f / 12.0f, 1.0f / 6.0f,
+        0.0f, 0.0f, 1.0f
+    };
+
+    static const float fmatrix4x4_5x5[] =
+    {
+        1.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+        -2.0f / 9.0f, -2.0f / 9.0f, -2.0f / 9.0f, -2.0f / 9.0f, -2.0f / 9.0f,
+        -2.0f / 9.0f, 2.0f / 9.0f, -2.0f / 9.0f, 2.0f / 9.0f, -2.0f / 9.0f,
+        1.0f / 90.0f, 1.0f / 45.0f, 2.0f / 45.0f, 4.0f / 45.0f, 8.0f / 45.0f,
+        1.0f / 90.0f, -1.0f / 45.0f, 2.0f / 45.0f, -4.0f / 45.0f, 8.0f / 45.0f,
+        4.0f / 45.0f, 2.0f / 45.0f, 1.0f / 45.0f, 1.0f / 90.0f, 1.0f / 180.0f,
+        4.0f / 45.0f, -2.0f / 45.0f, 1.0f / 45.0f, -1.0f / 90.0f, 1.0f / 180.0f,
+        0.0f, 0.0f, 0.0f, 0.0f, 1.0f
+
+    };
+
+    // ------------------------------------------
+
+    // Winograd output transform matrices (same naming: <output tile>_<kernel size>)
+    static const float omatrix2x2_3x3[] =
+    {
+        1.0f, 1.0f, 1.0f, 0.0f,
+        0.0f, 1.0f, -1.0f, -1.0f
+    };
+
+    static const float omatrix4x4_3x3[] =
+    {
+        1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.0f,
+        0.0f, 1.0f, -1.0f, 2.0f, -2.0f, 0.0f,
+        0.0f, 1.0f, 1.0f, 4.0f, 4.0f, 0.0f,
+        0.0f, 1.0f, -1.0f, 8.0f, -8.0f, 1.0f
+    };
+
+    static const float omatrix4x4_5x5[] =
+    {
+        1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 8.0f, 8.0f, 0.0f,
+        0.0f, 1.0f, -1.0f, 2.0f, -2.0f, 4.0f, -4.0f, 0.0f,
+        0.0f, 1.0f, 1.0f, 4.0f, 4.0f, 2.0f, 2.0f, 0.0f,
+        0.0f, 1.0f, -1.0f, 8.0f, -8.0f, 1.0f, -1.0f, 1.0f
+    };
+
+    // ------------------------------------------
+
+    using WinogradKey = std::tuple<std::pair<int, int>, std::pair<int, int>, WinogradTransformType>;
+
+    // Key = (Output tile size, Kernel size, Winograd transform type); value = pointer to the first coefficient of the matching table above
+    static std::map<WinogradKey, const float *> matrix_map =
+    {
+        { WinogradKey(std::pair<int, int>(2, 2), std::pair<int, int>(3, 3), WinogradTransformType::INPUT), imatrix2x2_3x3 },
+        { WinogradKey(std::pair<int, int>(4, 4), std::pair<int, int>(3, 3), WinogradTransformType::INPUT), imatrix4x4_3x3 },
+        { WinogradKey(std::pair<int, int>(4, 4), std::pair<int, int>(5, 5), WinogradTransformType::INPUT), imatrix4x4_5x5 },
+        { WinogradKey(std::pair<int, int>(2, 2), std::pair<int, int>(3, 3), WinogradTransformType::FILTER), fmatrix2x2_3x3 },
+        { WinogradKey(std::pair<int, int>(4, 4), std::pair<int, int>(3, 3), WinogradTransformType::FILTER), fmatrix4x4_3x3 },
+        { WinogradKey(std::pair<int, int>(4, 4), std::pair<int, int>(5, 5), WinogradTransformType::FILTER), fmatrix4x4_5x5 },
+        { WinogradKey(std::pair<int, int>(2, 2), std::pair<int, int>(3, 3), WinogradTransformType::OUTPUT), omatrix2x2_3x3 },
+        { WinogradKey(std::pair<int, int>(4, 4), std::pair<int, int>(3, 3), WinogradTransformType::OUTPUT), omatrix4x4_3x3 },
+        { WinogradKey(std::pair<int, int>(4, 4), std::pair<int, int>(5, 5), WinogradTransformType::OUTPUT), omatrix4x4_5x5 },
+    };
+
+    // Find the transformation matrix registered for the requested configuration
+    std::map<WinogradKey, const float *>::iterator it;
+
+    it = matrix_map.find(WinogradKey(std::pair<int, int>(output_tile_size.width, output_tile_size.height),
+                                     std::pair<int, int>(kernel_size.width, kernel_size.height),
+                                     winograd_transform_type));
+
+    float const *matrix_values = nullptr;
+    if(it != matrix_map.end())
+    {
+        // Get the pointer to the first coefficient of the selected table
+        matrix_values = it->second;
+    }
+    else
+    {
+        ARM_COMPUTE_ERROR("Winograd configuration not supported");
+    }
+
+    // Copy src.num_elements() values from the table into src. NOTE(review): the table length is not checked here - callers must size src to match
+    std::copy(&matrix_values[0], &matrix_values[0] + src.num_elements(), &src[0]);
+}
+} // namespace
+
+template <typename T>
+SimpleTensor<T> winograd_input_transform(const SimpleTensor<T> &in, const TensorShape &output_shape, const WinogradInfo &winograd_info)
+{
+    // Reference Winograd input transform (NCHW only): walks the grid of input tiles, pushes each tile
+    // through the two matrix products below and scatters the transformed values along the Z axis
+    // of the output, one tile per output row and one input channel per output column.
+    ARM_COMPUTE_ERROR_ON(in.data_layout() != DataLayout::NCHW);
+
+    const PadStrideInfo conv_info = winograd_info.convolution_info;
+    const Size2D        out_tile  = winograd_info.output_tile_size;
+    const Size2D        kernel    = winograd_info.kernel_size;
+
+    // Destination tensor
+    SimpleTensor<T> out{ output_shape, in.data_type() };
+
+    // An input tile spans (output tile + kernel - 1) elements along each axis
+    const unsigned int tile_w = out_tile.width + kernel.width - 1;
+    const unsigned int tile_h = out_tile.height + kernel.height - 1;
+    const TensorShape  tile_dims(tile_w, tile_h);
+
+    // Scratch tiles: tile read from the input, intermediate product, transformed tile
+    SimpleTensor<T> src_tile{ tile_dims, in.data_type() };
+    SimpleTensor<T> tmp_tile{ tile_dims, in.data_type() };
+    SimpleTensor<T> dst_tile{ tile_dims, in.data_type() };
+
+    // Transformation matrix for the input transform and its transpose
+    SimpleTensor<T> matrix{ tile_dims, in.data_type() };
+    SimpleTensor<T> matrix_transposed{ tile_dims, in.data_type() };
+    initialize_matrix_transform(matrix, out_tile, kernel, WinogradTransformType::INPUT);
+    transpose_matrix(matrix, matrix_transposed);
+
+    // Tensor extents; every dimension above Z of the input is counted as a batch
+    const int in_w        = in.shape().x();
+    const int in_h        = in.shape().y();
+    const int in_d        = in.shape().z();
+    const int out_d       = out.shape().z();
+    const int num_batches = in.shape().total_size() / (in_w * in_h * in_d);
+
+    // Tile grid covering the padded input, and the distance between two consecutive tiles
+    const int num_tiles_x = std::ceil((in_w - (kernel.width - 1) + conv_info.pad_left() + conv_info.pad_right()) / static_cast<float>(out_tile.width));
+    const int num_tiles_y = std::ceil((in_h - (kernel.height - 1) + conv_info.pad_top() + conv_info.pad_bottom()) / static_cast<float>(out_tile.height));
+    const int step_x      = out_tile.width;
+    const int step_y      = out_tile.height;
+
+    ARM_COMPUTE_ERROR_ON((num_tiles_x * num_tiles_y) != static_cast<int>(out.shape().y()));
+
+    for(int b = 0; b < num_batches; ++b)
+    {
+        for(int z = 0; z < in_d; ++z)
+        {
+            for(int y = 0; y < num_tiles_y; ++y)
+            {
+                // First input row of this row of tiles (can be negative because of the top padding)
+                const int yi = y * step_y - conv_info.pad_top();
+
+                for(int x = 0; x < num_tiles_x; ++x)
+                {
+                    // First input column of the tile (can be negative because of the left padding)
+                    const int xi = x * step_x - conv_info.pad_left();
+
+                    // Get the tile from the input tensor
+                    get_tile(in, src_tile, Coordinates(xi, yi, z, b));
+
+                    // Compute the transformation (two chained matrix products)
+                    matrix_multiply(matrix, src_tile, tmp_tile);
+                    matrix_multiply(tmp_tile, matrix_transposed, dst_tile);
+
+                    // Linear index of the tile inside the tile grid
+                    const int tile_idx = x + y * num_tiles_x;
+
+                    // Store the output tile across the channels: element i goes to output plane i
+                    for(int i = 0; i < out_d; ++i)
+                    {
+                        out[coords2index(out.shape(), Coordinates(z, tile_idx, i, b))] = dst_tile[i];
+                    }
+                }
+            }
+        }
+    }
+
+    return out;
+}
+
+template <typename T>
+SimpleTensor<T> winograd_filter_transform(const SimpleTensor<T> &in, const TensorShape &output_shape, const WinogradInfo &winograd_info)
+{
+    // Reference Winograd filter transform (NCHW only): each 2D kernel of the weights tensor goes through
+    // the two matrix products below and its transformed values are scattered across the output planes.
+    ARM_COMPUTE_ERROR_ON_MSG(in.data_layout() != DataLayout::NCHW, "Only supported NCHW data format");
+
+    const Size2D out_tile = winograd_info.output_tile_size;
+    const Size2D kernel   = winograd_info.kernel_size;
+
+    // Create reference
+    SimpleTensor<T> out{ output_shape, in.data_type(), 1 };
+
+    // A transformed kernel has the size of a Winograd input tile: (output tile + kernel - 1) per axis
+    const unsigned int tile_w    = out_tile.width + kernel.width - 1;
+    const unsigned int tile_h    = out_tile.height + kernel.height - 1;
+    const unsigned int tile_area = tile_w * tile_h;
+
+    // Shape of a single untransformed kernel, and its width
+    const TensorShape kernel_dims(kernel.width, kernel.height);
+    const auto        kernel_w = kernel_dims[0];
+
+    // Kernel currently being transformed
+    SimpleTensor<T> input_tile{ kernel_dims, in.data_type(), 1 };
+
+    // Transformation matrix for the filter transform
+    SimpleTensor<T> trans_matrix{ TensorShape(kernel_w, tile_w), in.data_type(), 1 };
+
+    // Transpose of the transformation matrix
+    SimpleTensor<T> trans_matrix_transposed{ TensorShape(tile_w, kernel_w), in.data_type(), 1 };
+
+    // Result of the first matrix product
+    SimpleTensor<T> tmp_tile{ TensorShape(kernel_w, tile_w), in.data_type(), 1 };
+
+    // Transformed kernel (result of the second matrix product)
+    SimpleTensor<T> transf_tile{ TensorShape(tile_w, tile_w), in.data_type(), 1 };
+
+    // Load the filter transform matrix and build its transpose
+    initialize_matrix_transform(trans_matrix, out_tile, kernel, WinogradTransformType::FILTER);
+    transpose_matrix(trans_matrix, trans_matrix_transposed);
+
+    const int num_channels = in.shape()[2];
+    const int num_filters  = in.shape()[3];
+    const int num_batches  = in.shape().total_size() / (kernel.area() * num_channels * num_filters);
+
+    for(int n = 0; n < num_batches; ++n)
+    {
+        for(int w = 0; w < num_filters; ++w)
+        {
+            for(int z = 0; z < num_channels; ++z)
+            {
+                // Load the kernel of channel z / filter w from the input tensor
+                get_tile(in, input_tile, Coordinates(0, 0, z, w, n));
+
+                // Apply the transformation (two chained matrix products)
+                matrix_multiply(trans_matrix, input_tile, tmp_tile);
+                matrix_multiply(tmp_tile, trans_matrix_transposed, transf_tile);
+
+                // Position of this (filter, channel) pair inside one output plane
+                const int output_offset = w + z * num_filters;
+
+                // Store the values across the channels: element i goes to output plane i
+                for(unsigned int i = 0; i < tile_area; ++i)
+                {
+                    out[output_offset + i * num_filters * num_channels] = transf_tile[i];
+                }
+            }
+        }
+    }
+
+    return out;
+}
+
+template <typename T>
+SimpleTensor<T> winograd_output_transform(const SimpleTensor<T> &in, const SimpleTensor<T> &b, const TensorShape &output_shape, const WinogradInfo &winograd_info)
+{
+    ARM_COMPUTE_ERROR_ON_MSG(winograd_info.output_data_layout != DataLayout::NCHW, "Only supported NCHW data format");
+    // Reference Winograd output transform: the values stored along Z at each (x, y) of 'in' form one tile, which is transformed and written, plus bias b[x], to output plane x
+    const PadStrideInfo conv_info        = winograd_info.convolution_info;
+    const Size2D        input_dimensions = winograd_info.input_dimensions;
+    const Size2D        output_tile_size = winograd_info.output_tile_size;
+    const Size2D        kernel_size      = winograd_info.kernel_size;
+
+    // Create reference (the output tensor returned by this function)
+    SimpleTensor<T> out{ output_shape, in.data_type(), 1 };
+
+    // Calculate dimensions for the tiles (input tile = output tile + kernel - 1 along each axis)
+    const unsigned int in_tile_w  = output_tile_size.width + kernel_size.width - 1;
+    const unsigned int in_tile_h  = output_tile_size.height + kernel_size.height - 1;
+    const unsigned int out_tile_w = output_tile_size.width;
+    const unsigned int out_tile_h = output_tile_size.height;
+
+    ARM_COMPUTE_ERROR_ON(in.shape()[2] != (in_tile_w * in_tile_h));
+    ARM_COMPUTE_ERROR_ON(in.shape()[0] != out.shape()[2]);
+
+    // Compute tile dimensions
+    // Input tile dimensions
+    TensorShape in_tile_dims(in_tile_w, in_tile_h);
+
+    // Output tile dimensions
+    TensorShape out_tile_dims(output_tile_size.width, output_tile_size.height);
+
+    // Transformation matrix dimensions (x = input tile width, y = output tile width)
+    TensorShape tr_tile_dims(in_tile_w, output_tile_size.width);
+
+    // Create tensors
+    // Simple tensor for the input tile
+    SimpleTensor<T> input_tile{ in_tile_dims, in.data_type(), 1 };
+
+    // Simple tensor for the transformation matrix
+    SimpleTensor<T> trans_matrix{ tr_tile_dims, in.data_type(), 1 };
+
+    // Simple tensor for the transformation matrix transpose
+    SimpleTensor<T> trans_matrix_transposed{ TensorShape(tr_tile_dims[1], tr_tile_dims[0]), in.data_type(), 1 };
+
+    // Simple tensor for the temporary tile (receives the result of the first transformation)
+    SimpleTensor<T> tmp_tile{ tr_tile_dims, in.data_type(), 1 };
+
+    // Simple tensor for the output tile (receives the result of the second transformation)
+    SimpleTensor<T> output_tile{ out_tile_dims, in.data_type(), 1 };
+
+    // Initialize matrix for the output transform
+    initialize_matrix_transform(trans_matrix, output_tile_size, kernel_size, WinogradTransformType::OUTPUT);
+
+    // Transpose the transformation matrix
+    transpose_matrix(trans_matrix, trans_matrix_transposed);
+
+    const int w_in        = in.shape()[0];
+    const int h_in        = in.shape()[1];
+    const int c_in        = in.shape()[2];
+    const int w_out       = out.shape()[0];
+    const int h_out       = out.shape()[1];
+    const int c_out       = out.shape()[2];
+    const int num_batches = in.shape().total_size() / (w_in * h_in * c_in);
+
+    // Input strides, in elements, for the Y, Z and batch dimensions
+    const int stridey_in = w_in;
+    const int stridez_in = stridey_in * h_in;
+    const int stridew_in = stridez_in * c_in;
+
+    // Output strides, in elements, for the Y, Z and batch dimensions
+    const int stridey_out = w_out;
+    const int stridez_out = stridey_out * h_out;
+    const int stridew_out = stridez_out * c_out;
+
+    // Compute number of elements to process in the X and Y direction
+    const int num_elements_x = input_dimensions.width - (kernel_size.width - 1) + conv_info.pad_left() + conv_info.pad_right();
+    const int num_elements_y = input_dimensions.height - (kernel_size.height - 1) + conv_info.pad_top() + conv_info.pad_bottom();
+    const int num_tiles_x    = std::ceil(num_elements_x / static_cast<float>(output_tile_size.width));
+    const int num_tiles_y    = std::ceil(num_elements_y / static_cast<float>(output_tile_size.height));
+    // num_tiles_y is otherwise only referenced by the check right below
+    ARM_COMPUTE_UNUSED(num_tiles_y);
+    ARM_COMPUTE_ERROR_ON(in.shape()[1] != static_cast<unsigned int>(num_tiles_x * num_tiles_y));
+
+    for(int n = 0; n < num_batches; ++n)
+    {
+        for(int y = 0; y < h_in; ++y)
+        {
+            for(int x = 0; x < w_in; ++x)
+            {
+                // Load the input tile across the channels of the input tensor
+                for(int z = 0; z < c_in; ++z)
+                {
+                    input_tile[z] = in[x + (y * stridey_in) + (z * stridez_in) + (n * stridew_in)];
+                }
+
+                // First transformation
+                matrix_multiply(trans_matrix, input_tile, tmp_tile);
+
+                // Second transformation
+                matrix_multiply(tmp_tile, trans_matrix_transposed, output_tile);
+
+                // Store the output tile: input row y enumerates the tiles row by row, input column x selects the output plane
+                const int xo = (y % num_tiles_x) * out_tile_w;
+                const int yo = (y / num_tiles_x) * out_tile_h;
+                const int zo = x;
+
+                const int output_offset = xo + (yo * stridey_out) + (zo * stridez_out) + (n * stridew_out);
+
+                for(int yi = 0; yi < static_cast<int>(out_tile_h); ++yi)
+                {
+                    for(int xi = 0; xi < static_cast<int>(out_tile_w); ++xi)
+                    {
+                        // Check out-of-bound writes: skip tile elements that fall outside the output width/height
+                        if((xo + xi < w_out) && (yo + yi < h_out))
+                        {
+                            out[output_offset + yi * stridey_out + xi] = output_tile[xi + yi * out_tile_w];
+
+                            // Add bias (one bias value per output plane)
+                            out[output_offset + yi * stridey_out + xi] += b[zo];
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    return out;
+}
+
+template SimpleTensor<float> winograd_filter_transform(const SimpleTensor<float> &in, const TensorShape &output_shape, const WinogradInfo &winograd_info);
+template SimpleTensor<float> winograd_input_transform(const SimpleTensor<float> &in, const TensorShape &output_shape, const WinogradInfo &winograd_info);
+template SimpleTensor<float> winograd_output_transform(const SimpleTensor<float> &in, const SimpleTensor<float> &b, const TensorShape &output_shape, const WinogradInfo &winograd_info);
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/reference/Winograd.h b/tests/validation/reference/Winograd.h
new file mode 100644
index 0000000..b74c2c3
--- /dev/null
+++ b/tests/validation/reference/Winograd.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_WINOGRAD_H__
+#define __ARM_COMPUTE_TEST_WINOGRAD_H__
+
+#include "arm_compute/core/TensorShape.h"
+
+#include "tests/SimpleTensor.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+/** Winograd transform type: tag naming one of the three transforms declared in this header */
+enum class WinogradTransformType
+{
+    INPUT,  /**< Winograd input transform (cf. winograd_input_transform()) */
+    FILTER, /**< Winograd filter transform (cf. winograd_filter_transform()) */
+    OUTPUT  /**< Winograd output transform (cf. winograd_output_transform()) */
+};
+/** Reference Winograd input transform of @p in, configured by @p winograd_info; the result is returned by value. @p output_shape is presumably the shape of the returned tensor (TODO confirm against the definition). An explicit instantiation for T = float is provided alongside the definition. */
+template <typename T>
+SimpleTensor<T> winograd_input_transform(const SimpleTensor<T> &in, const TensorShape &output_shape, const WinogradInfo &winograd_info);
+/** Reference Winograd filter transform of @p in, configured by @p winograd_info; the result is returned by value. @p output_shape is presumably the shape of the returned tensor (TODO confirm against the definition). An explicit instantiation for T = float is provided alongside the definition. */
+template <typename T>
+SimpleTensor<T> winograd_filter_transform(const SimpleTensor<T> &in, const TensorShape &output_shape, const WinogradInfo &winograd_info);
+/** Reference Winograd output transform of @p in, configured by @p winograd_info; the result is returned by value. Output tiles are stored with a bounds check against the output width and height, and the bias @p b, indexed by the output z coordinate, is added to every element written. An explicit instantiation for T = float is provided alongside the definition. */
+template <typename T>
+SimpleTensor<T> winograd_output_transform(const SimpleTensor<T> &in, const SimpleTensor<T> &b, const TensorShape &output_shape, const WinogradInfo &winograd_info);
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_TEST_WINOGRAD_H__ */