arm_compute v18.11
diff --git a/tests/validation/reference/ActivationLayer.cpp b/tests/validation/reference/ActivationLayer.cpp
index 9455eff..9887e42 100644
--- a/tests/validation/reference/ActivationLayer.cpp
+++ b/tests/validation/reference/ActivationLayer.cpp
@@ -34,7 +34,7 @@
 {
 namespace reference
 {
-template <typename T, typename std::enable_if<is_floating_point<T>::value, int>::type>
+template <typename T>
 SimpleTensor<T> activation_layer(const SimpleTensor<T> &src, ActivationLayerInfo info)
 {
     // Create reference
@@ -46,46 +46,7 @@
 
     for(int i = 0; i < src.num_elements(); ++i)
     {
-        T x = src[i];
-
-        switch(info.activation())
-        {
-            case ActivationLayerInfo::ActivationFunction::ABS:
-                dst[i] = std::abs(x);
-                break;
-            case ActivationLayerInfo::ActivationFunction::LINEAR:
-                dst[i] = a * x + b;
-                break;
-            case ActivationLayerInfo::ActivationFunction::LOGISTIC:
-                dst[i] = static_cast<T>(1) / (static_cast<T>(1) + std::exp(-x));
-                break;
-            case ActivationLayerInfo::ActivationFunction::RELU:
-                dst[i] = std::max<T>(static_cast<T>(0), x);
-                break;
-            case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
-                dst[i] = std::min<T>(a, std::max(static_cast<T>(0), x));
-                break;
-            case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
-                dst[i] = std::min<T>(a, std::max<T>(b, x));
-                break;
-            case ActivationLayerInfo::ActivationFunction::LEAKY_RELU:
-                dst[i] = (x > 0) ? x : a * x;
-                break;
-            case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
-                dst[i] = std::log(static_cast<T>(1) + std::exp(x));
-                break;
-            case ActivationLayerInfo::ActivationFunction::SQRT:
-                dst[i] = std::sqrt(x);
-                break;
-            case ActivationLayerInfo::ActivationFunction::SQUARE:
-                dst[i] = x * x;
-                break;
-            case ActivationLayerInfo::ActivationFunction::TANH:
-                dst[i] = a * std::tanh(b * x);
-                break;
-            default:
-                ARM_COMPUTE_ERROR("Unsupported activation function");
-        }
+        dst[i] = activate_float<T>(src[i], a, b, info.activation());
     }
 
     return dst;
diff --git a/tests/validation/reference/ActivationLayer.h b/tests/validation/reference/ActivationLayer.h
index 09f602f..77b3530 100644
--- a/tests/validation/reference/ActivationLayer.h
+++ b/tests/validation/reference/ActivationLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -35,10 +35,55 @@
 {
 namespace reference
 {
-template <typename T, typename std::enable_if<is_floating_point<T>::value, int>::type = 0>
-SimpleTensor<T> activation_layer(const SimpleTensor<T> &src, ActivationLayerInfo info);
+template <typename T>
+inline T activate_float(T x, T a, T b, ActivationLayerInfo::ActivationFunction activation)
+{
+    T ret;
 
-template <typename T, typename std::enable_if<std::is_integral<T>::value, int>::type = 0>
+    switch(activation)
+    {
+        case ActivationLayerInfo::ActivationFunction::ABS:
+            ret = std::abs(x);
+            break;
+        case ActivationLayerInfo::ActivationFunction::LINEAR:
+            ret = a * x + b;
+            break;
+        case ActivationLayerInfo::ActivationFunction::LOGISTIC:
+            ret = static_cast<T>(1) / (static_cast<T>(1) + std::exp(-x));
+            break;
+        case ActivationLayerInfo::ActivationFunction::RELU:
+            ret = std::max<T>(static_cast<T>(0), x);
+            break;
+        case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
+            ret = std::min<T>(a, std::max(static_cast<T>(0), x));
+            break;
+        case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
+            ret = std::min<T>(a, std::max<T>(b, x));
+            break;
+        case ActivationLayerInfo::ActivationFunction::LEAKY_RELU:
+            ret = (x > 0) ? x : a * x;
+            break;
+        case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
+            ret = std::log(static_cast<T>(1) + std::exp(x));
+            break;
+        case ActivationLayerInfo::ActivationFunction::SQRT:
+            ret = std::sqrt(x);
+            break;
+        case ActivationLayerInfo::ActivationFunction::SQUARE:
+            ret = x * x;
+            break;
+        case ActivationLayerInfo::ActivationFunction::TANH:
+            ret = a * std::tanh(b * x);
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Unsupported activation function");
+            break;
+    }
+
+    return ret;
+}
+
+template <typename T>
 SimpleTensor<T> activation_layer(const SimpleTensor<T> &src, ActivationLayerInfo info);
 } // namespace reference
 } // namespace validation
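
The per-element activation logic is lifted out of the templated activation_layer() body into the inline helper activate_float<T>, so other reference implementations (for example fused-activation paths) can reuse it. A minimal usage sketch, assuming the header above is included (example() is a hypothetical wrapper, not part of the patch):

    #include "tests/validation/reference/ActivationLayer.h"

    using arm_compute::ActivationLayerInfo;
    namespace ref = arm_compute::test::validation::reference;

    void example()
    {
        // Evaluates the TANH case above: 1.0f * std::tanh(0.5f * 2.0f)
        const float y = ref::activate_float<float>(2.0f, 1.0f, 0.5f,
                                                   ActivationLayerInfo::ActivationFunction::TANH);
        (void)y; // suppress unused-variable warning in this sketch
    }
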
diff --git a/tests/validation/reference/ArithmeticAddition.cpp b/tests/validation/reference/ArithmeticAddition.cpp
deleted file mode 100644
index c68c6d4..0000000
--- a/tests/validation/reference/ArithmeticAddition.cpp
+++ /dev/null
@@ -1,147 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "ArithmeticAddition.h"
-
-#include "arm_compute/core/Types.h"
-#include "tests/validation/Helpers.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace reference
-{
-namespace
-{
-template <typename T>
-T add(T src1, T src2, ConvertPolicy convert_policy)
-{
-    using intermediate_type = typename common_promoted_signed_type<T>::intermediate_type;
-
-    intermediate_type val = static_cast<intermediate_type>(src1) + static_cast<intermediate_type>(src2);
-
-    T result = (convert_policy == ConvertPolicy::SATURATE) ? saturate_cast<T>(val) : static_cast<T>(val);
-
-    return result;
-}
-
-template <size_t dim>
-struct BroadcastUnroll
-{
-    template <typename T>
-    static void unroll(const SimpleTensor<T> &src1, const SimpleTensor<T> &src2, SimpleTensor<T> &dst,
-                       ConvertPolicy convert_policy, Coordinates &id_src1, Coordinates &id_src2, Coordinates &id_dst)
-    {
-        const bool src1_is_broadcast = (src1.shape()[dim - 1] != dst.shape()[dim - 1]);
-        const bool src2_is_broadcast = (src2.shape()[dim - 1] != dst.shape()[dim - 1]);
-
-        id_src1.set(dim - 1, 0);
-        id_src2.set(dim - 1, 0);
-        id_dst.set(dim - 1, 0);
-
-        for(size_t i = 0; i < dst.shape()[dim - 1]; ++i, ++id_dst[dim - 1])
-        {
-            BroadcastUnroll < dim - 1 >::unroll(src1, src2, dst, convert_policy, id_src1, id_src2, id_dst);
-
-            id_src1[dim - 1] += !src1_is_broadcast;
-            id_src2[dim - 1] += !src2_is_broadcast;
-        }
-    }
-};
-
-template <>
-struct BroadcastUnroll<0>
-{
-    template <typename T>
-    static void unroll(const SimpleTensor<T> &src1, const SimpleTensor<T> &src2, SimpleTensor<T> &dst,
-                       ConvertPolicy convert_policy, Coordinates &id_src1, Coordinates &id_src2, Coordinates &id_dst)
-    {
-        dst[coord2index(dst.shape(), id_dst)] = add(src1[coord2index(src1.shape(), id_src1)], src2[coord2index(src2.shape(), id_src2)], convert_policy);
-    }
-};
-} // namespace
-
-template <typename T>
-SimpleTensor<T> arithmetic_addition(const SimpleTensor<T> &src1, const SimpleTensor<T> &src2, SimpleTensor<T> &dst, ConvertPolicy convert_policy)
-{
-    Coordinates id_src1, id_src2, id_dst;
-
-    BroadcastUnroll<Coordinates::num_max_dimensions>::unroll(src1, src2, dst, convert_policy, id_src1, id_src2, id_dst);
-
-    return dst;
-}
-
-template <>
-SimpleTensor<uint8_t> arithmetic_addition(const SimpleTensor<uint8_t> &src1, const SimpleTensor<uint8_t> &src2, SimpleTensor<uint8_t> &dst, ConvertPolicy convert_policy)
-{
-    if(dst.data_type() == DataType::QASYMM8)
-    {
-        SimpleTensor<float> src1_tmp = convert_from_asymmetric(src1);
-        SimpleTensor<float> src2_tmp = convert_from_asymmetric(src2);
-        SimpleTensor<float> dst_tmp(TensorShape::broadcast_shape(src1.shape(), src2.shape()), dst.data_type());
-
-        Coordinates id_src1, id_src2, id_dst;
-
-        BroadcastUnroll<Coordinates::num_max_dimensions>::unroll(src1_tmp, src2_tmp, dst_tmp, convert_policy, id_src1, id_src2, id_dst);
-
-        dst = convert_to_asymmetric(dst_tmp, dst.quantization_info());
-        return dst;
-    }
-    else
-    {
-        // DataType::U8
-        Coordinates id_src1, id_src2, id_dst;
-
-        BroadcastUnroll<Coordinates::num_max_dimensions>::unroll(src1, src2, dst, convert_policy, id_src1, id_src2, id_dst);
-
-        return dst;
-    }
-}
-
-template SimpleTensor<int16_t> arithmetic_addition(const SimpleTensor<int16_t> &src1, const SimpleTensor<int16_t> &src2, SimpleTensor<int16_t> &dst, ConvertPolicy convert_policy);
-template SimpleTensor<int8_t> arithmetic_addition(const SimpleTensor<int8_t> &src1, const SimpleTensor<int8_t> &src2, SimpleTensor<int8_t> &dst, ConvertPolicy convert_policy);
-template SimpleTensor<half> arithmetic_addition(const SimpleTensor<half> &src1, const SimpleTensor<half> &src2, SimpleTensor<half> &dst, ConvertPolicy convert_policy);
-template SimpleTensor<float> arithmetic_addition(const SimpleTensor<float> &src1, const SimpleTensor<float> &src2, SimpleTensor<float> &dst, ConvertPolicy convert_policy);
-
-template <typename T>
-SimpleTensor<T> arithmetic_addition(const SimpleTensor<T> &src1, const SimpleTensor<T> &src2, DataType dst_data_type, ConvertPolicy convert_policy)
-{
-    ARM_COMPUTE_ERROR_ON_MSG(dst_data_type == DataType::QASYMM8, "For QASYMM8, the quantized output tensor should be passed directly.");
-
-    SimpleTensor<T> dst(TensorShape::broadcast_shape(src1.shape(), src2.shape()), dst_data_type);
-    arithmetic_addition<T>(src1, src2, dst, convert_policy);
-    return dst;
-}
-
-template SimpleTensor<int16_t> arithmetic_addition(const SimpleTensor<int16_t> &src1, const SimpleTensor<int16_t> &src2, DataType dst_data_type, ConvertPolicy convert_policy);
-template SimpleTensor<int8_t> arithmetic_addition(const SimpleTensor<int8_t> &src1, const SimpleTensor<int8_t> &src2, DataType dst_data_type, ConvertPolicy convert_policy);
-template SimpleTensor<half> arithmetic_addition(const SimpleTensor<half> &src1, const SimpleTensor<half> &src2, DataType dst_data_type, ConvertPolicy convert_policy);
-template SimpleTensor<float> arithmetic_addition(const SimpleTensor<float> &src1, const SimpleTensor<float> &src2, DataType dst_data_type, ConvertPolicy convert_policy);
-
-} // namespace reference
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/reference/ArithmeticOperations.cpp b/tests/validation/reference/ArithmeticOperations.cpp
new file mode 100644
index 0000000..062be93
--- /dev/null
+++ b/tests/validation/reference/ArithmeticOperations.cpp
@@ -0,0 +1,149 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "ArithmeticOperations.h"
+
+#include "arm_compute/core/Types.h"
+#include "tests/validation/Helpers.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+namespace
+{
+template <typename T>
+T arithm_op(ArithmeticOperation op, T src1, T src2, ConvertPolicy convert_policy)
+{
+    using intermediate_type = typename common_promoted_signed_type<T>::intermediate_type;
+
+    intermediate_type val = (op == ArithmeticOperation::ADD) ? static_cast<intermediate_type>(src1) + static_cast<intermediate_type>(src2) : static_cast<intermediate_type>
+                            (src1) - static_cast<intermediate_type>(src2);
+
+    T result = (convert_policy == ConvertPolicy::SATURATE) ? saturate_cast<T>(val) : static_cast<T>(val);
+
+    return result;
+}
+
+template <size_t dim>
+struct BroadcastUnroll
+{
+    template <typename T>
+    static void unroll(ArithmeticOperation op, const SimpleTensor<T> &src1, const SimpleTensor<T> &src2, SimpleTensor<T> &dst,
+                       ConvertPolicy convert_policy, Coordinates &id_src1, Coordinates &id_src2, Coordinates &id_dst)
+    {
+        const bool src1_is_broadcast = (src1.shape()[dim - 1] != dst.shape()[dim - 1]);
+        const bool src2_is_broadcast = (src2.shape()[dim - 1] != dst.shape()[dim - 1]);
+
+        id_src1.set(dim - 1, 0);
+        id_src2.set(dim - 1, 0);
+        id_dst.set(dim - 1, 0);
+
+        for(size_t i = 0; i < dst.shape()[dim - 1]; ++i, ++id_dst[dim - 1])
+        {
+            BroadcastUnroll < dim - 1 >::unroll(op, src1, src2, dst, convert_policy, id_src1, id_src2, id_dst);
+
+            id_src1[dim - 1] += !src1_is_broadcast;
+            id_src2[dim - 1] += !src2_is_broadcast;
+        }
+    }
+};
+
+template <>
+struct BroadcastUnroll<0>
+{
+    template <typename T>
+    static void unroll(ArithmeticOperation op, const SimpleTensor<T> &src1, const SimpleTensor<T> &src2, SimpleTensor<T> &dst,
+                       ConvertPolicy convert_policy, Coordinates &id_src1, Coordinates &id_src2, Coordinates &id_dst)
+    {
+        dst[coord2index(dst.shape(), id_dst)] = arithm_op(op, src1[coord2index(src1.shape(), id_src1)], src2[coord2index(src2.shape(), id_src2)], convert_policy);
+    }
+};
+} // namespace
+
+template <typename T>
+SimpleTensor<T> arithmetic_operation(ArithmeticOperation op, const SimpleTensor<T> &src1, const SimpleTensor<T> &src2, SimpleTensor<T> &dst, ConvertPolicy convert_policy)
+{
+    Coordinates id_src1, id_src2, id_dst;
+
+    BroadcastUnroll<Coordinates::num_max_dimensions>::unroll(op, src1, src2, dst, convert_policy, id_src1, id_src2, id_dst);
+
+    return dst;
+}
+
+template <>
+SimpleTensor<uint8_t> arithmetic_operation(ArithmeticOperation op, const SimpleTensor<uint8_t> &src1, const SimpleTensor<uint8_t> &src2, SimpleTensor<uint8_t> &dst, ConvertPolicy convert_policy)
+{
+    if(dst.data_type() == DataType::QASYMM8)
+    {
+        SimpleTensor<float> src1_tmp = convert_from_asymmetric(src1);
+        SimpleTensor<float> src2_tmp = convert_from_asymmetric(src2);
+        SimpleTensor<float> dst_tmp(TensorShape::broadcast_shape(src1.shape(), src2.shape()), dst.data_type());
+
+        Coordinates id_src1, id_src2, id_dst;
+
+        BroadcastUnroll<Coordinates::num_max_dimensions>::unroll(op, src1_tmp, src2_tmp, dst_tmp, convert_policy, id_src1, id_src2, id_dst);
+
+        dst = convert_to_asymmetric(dst_tmp, dst.quantization_info());
+        return dst;
+    }
+    else
+    {
+        // DataType::U8
+        Coordinates id_src1, id_src2, id_dst;
+
+        BroadcastUnroll<Coordinates::num_max_dimensions>::unroll(op, src1, src2, dst, convert_policy, id_src1, id_src2, id_dst);
+
+        return dst;
+    }
+}
+
+template SimpleTensor<int16_t> arithmetic_operation(ArithmeticOperation op, const SimpleTensor<int16_t> &src1, const SimpleTensor<int16_t> &src2, SimpleTensor<int16_t> &dst,
+                                                    ConvertPolicy convert_policy);
+template SimpleTensor<int8_t> arithmetic_operation(ArithmeticOperation op, const SimpleTensor<int8_t> &src1, const SimpleTensor<int8_t> &src2, SimpleTensor<int8_t> &dst, ConvertPolicy convert_policy);
+template SimpleTensor<half> arithmetic_operation(ArithmeticOperation op, const SimpleTensor<half> &src1, const SimpleTensor<half> &src2, SimpleTensor<half> &dst, ConvertPolicy convert_policy);
+template SimpleTensor<float> arithmetic_operation(ArithmeticOperation op, const SimpleTensor<float> &src1, const SimpleTensor<float> &src2, SimpleTensor<float> &dst, ConvertPolicy convert_policy);
+
+template <typename T>
+SimpleTensor<T> arithmetic_operation(ArithmeticOperation op, const SimpleTensor<T> &src1, const SimpleTensor<T> &src2, DataType dst_data_type, ConvertPolicy convert_policy)
+{
+    ARM_COMPUTE_ERROR_ON_MSG(dst_data_type == DataType::QASYMM8, "For QASYMM8, the quantized output tensor should be passed directly.");
+
+    SimpleTensor<T> dst(TensorShape::broadcast_shape(src1.shape(), src2.shape()), dst_data_type);
+    arithmetic_operation<T>(op, src1, src2, dst, convert_policy);
+    return dst;
+}
+
+template SimpleTensor<int16_t> arithmetic_operation(ArithmeticOperation op, const SimpleTensor<int16_t> &src1, const SimpleTensor<int16_t> &src2, DataType dst_data_type, ConvertPolicy convert_policy);
+template SimpleTensor<int8_t> arithmetic_operation(ArithmeticOperation op, const SimpleTensor<int8_t> &src1, const SimpleTensor<int8_t> &src2, DataType dst_data_type, ConvertPolicy convert_policy);
+template SimpleTensor<half> arithmetic_operation(ArithmeticOperation op, const SimpleTensor<half> &src1, const SimpleTensor<half> &src2, DataType dst_data_type, ConvertPolicy convert_policy);
+template SimpleTensor<float> arithmetic_operation(ArithmeticOperation op, const SimpleTensor<float> &src1, const SimpleTensor<float> &src2, DataType dst_data_type, ConvertPolicy convert_policy);
+
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/reference/ArithmeticAddition.h b/tests/validation/reference/ArithmeticOperations.h
similarity index 70%
copy from tests/validation/reference/ArithmeticAddition.h
copy to tests/validation/reference/ArithmeticOperations.h
index faeabd7..7363482 100644
--- a/tests/validation/reference/ArithmeticAddition.h
+++ b/tests/validation/reference/ArithmeticOperations.h
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef __ARM_COMPUTE_TEST_ARITHMETIC_ADDITION_H__
-#define __ARM_COMPUTE_TEST_ARITHMETIC_ADDITION_H__
+#ifndef __ARM_COMPUTE_TEST_ARITHMETIC_OPERATIONS_H__
+#define __ARM_COMPUTE_TEST_ARITHMETIC_OPERATIONS_H__
 
 #include "tests/SimpleTensor.h"
 #include "tests/validation/Helpers.h"
@@ -35,14 +35,20 @@
 {
 namespace reference
 {
-template <typename T>
-SimpleTensor<T> arithmetic_addition(const SimpleTensor<T> &src1, const SimpleTensor<T> &src2, SimpleTensor<T> &dst, ConvertPolicy convert_policy);
+/** Arithmetic operation types */
+enum class ArithmeticOperation
+{
+    ADD,
+    SUB
+};
 
 template <typename T>
-SimpleTensor<T> arithmetic_addition(const SimpleTensor<T> &src1, const SimpleTensor<T> &src2, DataType dst_data_type, ConvertPolicy convert_policy);
+SimpleTensor<T> arithmetic_operation(ArithmeticOperation op, const SimpleTensor<T> &src1, const SimpleTensor<T> &src2, SimpleTensor<T> &dst, ConvertPolicy convert_policy);
 
+template <typename T>
+SimpleTensor<T> arithmetic_operation(ArithmeticOperation op, const SimpleTensor<T> &src1, const SimpleTensor<T> &src2, DataType dst_data_type, ConvertPolicy convert_policy);
 } // namespace reference
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
-#endif /* __ARM_COMPUTE_TEST_ARITHMETIC_ADDITION_H__ */
+#endif /* __ARM_COMPUTE_TEST_ARITHMETIC_OPERATIONS_H__ */
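
The separate addition and subtraction references are merged into a single arithmetic_operation() entry point; the operation is now selected by the ArithmeticOperation enum instead of by the function name. A rough call-site migration sketch (src1 and src2 are assumed pre-filled SimpleTensor<float> inputs, with the arm_compute::test::validation namespaces in scope as in the existing fixtures):

    // Before (ArithmeticAddition.h / ArithmeticSubtraction.h):
    //   auto sum  = reference::arithmetic_addition(src1, src2, DataType::F32, ConvertPolicy::WRAP);
    //   auto diff = reference::arithmetic_subtraction<float, float, float>(src1, src2, DataType::F32, ConvertPolicy::WRAP);

    // After (ArithmeticOperations.h):
    auto sum  = reference::arithmetic_operation(reference::ArithmeticOperation::ADD,
                                                src1, src2, DataType::F32, ConvertPolicy::WRAP);
    auto diff = reference::arithmetic_operation(reference::ArithmeticOperation::SUB,
                                                src1, src2, DataType::F32, ConvertPolicy::WRAP);
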
diff --git a/tests/validation/reference/ArithmeticSubtraction.cpp b/tests/validation/reference/ArithmeticSubtraction.cpp
deleted file mode 100644
index f39d01f..0000000
--- a/tests/validation/reference/ArithmeticSubtraction.cpp
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "ArithmeticSubtraction.h"
-
-#include "tests/validation/Helpers.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace reference
-{
-template <typename T1, typename T2, typename T3>
-SimpleTensor<T3> arithmetic_subtraction(const SimpleTensor<T1> &src1, const SimpleTensor<T2> &src2, DataType dst_data_type, ConvertPolicy convert_policy)
-{
-    SimpleTensor<T3> result(src1.shape(), dst_data_type);
-
-    using intermediate_type = typename common_promoted_signed_type<typename std::conditional<sizeof(T1) >= sizeof(T2), T1, T2>::type >::intermediate_type;
-
-    for(int i = 0; i < src1.num_elements(); ++i)
-    {
-        intermediate_type val = static_cast<intermediate_type>(src1[i]) - static_cast<intermediate_type>(src2[i]);
-        result[i]             = (convert_policy == ConvertPolicy::SATURATE) ? saturate_cast<T3>(val) : static_cast<T3>(val);
-    }
-
-    return result;
-}
-
-template SimpleTensor<uint8_t> arithmetic_subtraction(const SimpleTensor<uint8_t> &src1, const SimpleTensor<uint8_t> &src2, DataType dst_data_type, ConvertPolicy convert_policy);
-template SimpleTensor<int16_t> arithmetic_subtraction(const SimpleTensor<uint8_t> &src1, const SimpleTensor<uint8_t> &src2, DataType dst_data_type, ConvertPolicy convert_policy);
-template SimpleTensor<int16_t> arithmetic_subtraction(const SimpleTensor<uint8_t> &src1, const SimpleTensor<int16_t> &src2, DataType dst_data_type, ConvertPolicy convert_policy);
-template SimpleTensor<int16_t> arithmetic_subtraction(const SimpleTensor<int16_t> &src1, const SimpleTensor<uint8_t> &src2, DataType dst_data_type, ConvertPolicy convert_policy);
-template SimpleTensor<int16_t> arithmetic_subtraction(const SimpleTensor<int16_t> &src1, const SimpleTensor<int16_t> &src2, DataType dst_data_type, ConvertPolicy convert_policy);
-template SimpleTensor<int8_t> arithmetic_subtraction(const SimpleTensor<int8_t> &src1, const SimpleTensor<int8_t> &src2, DataType dst_data_type, ConvertPolicy convert_policy);
-template SimpleTensor<half> arithmetic_subtraction(const SimpleTensor<half> &src1, const SimpleTensor<half> &src2, DataType dst_data_type, ConvertPolicy convert_policy);
-template SimpleTensor<float> arithmetic_subtraction(const SimpleTensor<float> &src1, const SimpleTensor<float> &src2, DataType dst_data_type, ConvertPolicy convert_policy);
-} // namespace reference
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/reference/BatchNormalizationLayer.cpp b/tests/validation/reference/BatchNormalizationLayer.cpp
index 4ea3769..37713c8 100644
--- a/tests/validation/reference/BatchNormalizationLayer.cpp
+++ b/tests/validation/reference/BatchNormalizationLayer.cpp
@@ -77,7 +77,6 @@
 template SimpleTensor<half> batch_normalization_layer(const SimpleTensor<half> &src, const SimpleTensor<half> &mean, const SimpleTensor<half> &var,
                                                       const SimpleTensor<half> &beta,
                                                       const SimpleTensor<half> &gamma, float epsilon, ActivationLayerInfo act_info);
-
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/BatchNormalizationLayer.h b/tests/validation/reference/BatchNormalizationLayer.h
index b45d820..a5d99c1 100644
--- a/tests/validation/reference/BatchNormalizationLayer.h
+++ b/tests/validation/reference/BatchNormalizationLayer.h
@@ -35,10 +35,6 @@
 {
 namespace reference
 {
-template <typename T, typename std::enable_if<std::is_integral<T>::value, int>::type * = nullptr>
-SimpleTensor<T> batch_normalization_layer(const SimpleTensor<T> &src, const SimpleTensor<T> &mean, const SimpleTensor<T> &var, const SimpleTensor<T> &beta, const SimpleTensor<T> &gamma, float epsilon,
-                                          ActivationLayerInfo act_info);
-
 template <typename T, typename std::enable_if<is_floating_point<T>::value, int>::type * = nullptr>
 SimpleTensor<T> batch_normalization_layer(const SimpleTensor<T> &src, const SimpleTensor<T> &mean, const SimpleTensor<T> &var, const SimpleTensor<T> &beta, const SimpleTensor<T> &gamma, float epsilon,
                                           ActivationLayerInfo act_info);
diff --git a/tests/validation/reference/BatchToSpaceLayer.cpp b/tests/validation/reference/BatchToSpaceLayer.cpp
new file mode 100644
index 0000000..662a707
--- /dev/null
+++ b/tests/validation/reference/BatchToSpaceLayer.cpp
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "BatchToSpaceLayer.h"
+
+#include "tests/validation/Helpers.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+// Batch to Space
+template <typename T>
+SimpleTensor<T> batch_to_space(const SimpleTensor<T> &src, const SimpleTensor<int32_t> &block_shape, const TensorShape &dst_shape)
+{
+    ARM_COMPUTE_ERROR_ON(block_shape[0] <= 0);
+    ARM_COMPUTE_ERROR_ON(block_shape[1] <= 0);
+    SimpleTensor<T> result(dst_shape, src.data_type());
+
+    int        in_pos    = 0;
+    const auto width_in  = static_cast<int>(src.shape()[0]);
+    const auto height_in = static_cast<int>(src.shape()[1]);
+    const auto z_in      = static_cast<int>(src.shape()[2]);
+    const auto batch_in  = static_cast<int>(src.shape()[3]);
+
+    for(int batch = 0; batch < batch_in; ++batch)
+    {
+        for(int z = 0; z < z_in; ++z)
+        {
+            for(int y = 0; y < height_in; ++y)
+            {
+                for(int x = 0; x < width_in; ++x)
+                {
+                    const int r       = src.shape()[3] / (block_shape[0] * block_shape[1]);
+                    const int out_x   = (block_shape[0] * x + (batch / r) % block_shape[0]);
+                    const int out_y   = (block_shape[1] * y + (batch / r) / block_shape[0]);
+                    const int out_pos = out_x + dst_shape[0] * out_y + z * dst_shape[0] * dst_shape[1] + (batch % r) * dst_shape[0] * dst_shape[1] * dst_shape[2];
+                    result[out_pos]   = src[in_pos];
+                    ++in_pos;
+                }
+            }
+        }
+    }
+
+    return result;
+}
+template SimpleTensor<float> batch_to_space(const SimpleTensor<float> &src, const SimpleTensor<int32_t> &block_shape, const TensorShape &dst_shape);
+template SimpleTensor<half> batch_to_space(const SimpleTensor<half> &src, const SimpleTensor<int32_t> &block_shape, const TensorShape &dst_shape);
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
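
As a worked example of the index arithmetic above: with block_shape = {2, 2} and an input of shape 1x1xCx4, r = 4 / (2 * 2) = 1, so input batch b is written to out_x = 2 * 0 + b % 2, out_y = 2 * 0 + b / 2 and output batch b % 1 = 0; the four input batches tile a single 2x2 output plane per channel.
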
diff --git a/tests/validation/reference/ArithmeticSubtraction.h b/tests/validation/reference/BatchToSpaceLayer.h
similarity index 72%
copy from tests/validation/reference/ArithmeticSubtraction.h
copy to tests/validation/reference/BatchToSpaceLayer.h
index 9308314..d17cbe5 100644
--- a/tests/validation/reference/ArithmeticSubtraction.h
+++ b/tests/validation/reference/BatchToSpaceLayer.h
@@ -1,9 +1,9 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * Permission is hereby granted, free of charge, to any  person obtaining a copy
  * of this software and associated documentation files (the "Software"), to
  * deal in the Software without restriction, including without limitation the
  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef __ARM_COMPUTE_TEST_ARITHMETIC_SUBTRACTION_H__
-#define __ARM_COMPUTE_TEST_ARITHMETIC_SUBTRACTION_H__
+#ifndef __ARM_COMPUTE_TEST_BATCH_TO_SPACE_LAYER_H__
+#define __ARM_COMPUTE_TEST_BATCH_TO_SPACE_LAYER_H__
 
 #include "tests/SimpleTensor.h"
 #include "tests/validation/Helpers.h"
@@ -35,10 +35,10 @@
 {
 namespace reference
 {
-template <typename T1, typename T2, typename T3>
-SimpleTensor<T3> arithmetic_subtraction(const SimpleTensor<T1> &src1, const SimpleTensor<T2> &src2, DataType dst_data_type, ConvertPolicy convert_policy);
+template <typename T>
+SimpleTensor<T> batch_to_space(const SimpleTensor<T> &src, const SimpleTensor<int32_t> &block_shape, const TensorShape &dst_shape);
 } // namespace reference
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
-#endif /* __ARM_COMPUTE_TEST_ARITHMETIC_SUBTRACTION_H__ */
+#endif /* __ARM_COMPUTE_TEST_BATCH_TO_SPACE_LAYER_H__ */
diff --git a/tests/validation/reference/BoundingBoxTransform.cpp b/tests/validation/reference/BoundingBoxTransform.cpp
new file mode 100644
index 0000000..55dd165
--- /dev/null
+++ b/tests/validation/reference/BoundingBoxTransform.cpp
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "BoundingBoxTransform.h"
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/misc/Utility.h"
+#include "tests/validation/Helpers.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+template <typename T>
+SimpleTensor<T> bounding_box_transform(const SimpleTensor<T> &boxes, const SimpleTensor<T> &deltas, const BoundingBoxTransformInfo &info)
+{
+    const DataType  boxes_data_type = deltas.data_type();
+    SimpleTensor<T> pred_boxes(deltas.shape(), boxes_data_type);
+
+    const size_t num_classes    = deltas.shape()[0] / 4;
+    const size_t num_boxes      = deltas.shape()[1];
+    const T     *deltas_ptr     = deltas.data();
+    T           *pred_boxes_ptr = pred_boxes.data();
+
+    const int img_h = floor(info.img_height() / info.scale() + 0.5f);
+    const int img_w = floor(info.img_width() / info.scale() + 0.5f);
+
+    const auto scale_after  = (info.apply_scale() ? T(info.scale()) : T(1));
+    const auto scale_before = T(info.scale());
+    ARM_COMPUTE_ERROR_ON(scale_before <= 0);
+    const auto offset = (info.correct_transform_coords() ? T(1.f) : T(0.f));
+
+    const size_t box_fields   = 4;
+    const size_t class_fields = 4;
+
+    for(size_t i = 0; i < num_boxes; ++i)
+    {
+        // Extract ROI information
+        const size_t start_box = box_fields * i;
+        const T      width     = (boxes[start_box + 2] / scale_before) - (boxes[start_box] / scale_before) + T(1.f);
+        const T      height    = (boxes[start_box + 3] / scale_before) - (boxes[start_box + 1] / scale_before) + T(1.f);
+        const T      ctr_x     = (boxes[start_box] / scale_before) + T(0.5f) * width;
+        const T      ctr_y     = (boxes[start_box + 1] / scale_before) + T(0.5f) * height;
+
+        for(size_t j = 0; j < num_classes; ++j)
+        {
+            // Extract deltas
+            const size_t start_delta = i * num_classes * class_fields + class_fields * j;
+            const T      dx          = deltas_ptr[start_delta] / T(info.weights()[0]);
+            const T      dy          = deltas_ptr[start_delta + 1] / T(info.weights()[1]);
+            T            dw          = deltas_ptr[start_delta + 2] / T(info.weights()[2]);
+            T            dh          = deltas_ptr[start_delta + 3] / T(info.weights()[3]);
+
+            // Clip dw and dh
+            dw = std::min(dw, T(info.bbox_xform_clip()));
+            dh = std::min(dh, T(info.bbox_xform_clip()));
+
+            // Determine the predictions
+            const T pred_ctr_x = dx * width + ctr_x;
+            const T pred_ctr_y = dy * height + ctr_y;
+            const T pred_w     = T(std::exp(dw)) * width;
+            const T pred_h     = T(std::exp(dh)) * height;
+
+            // Store the prediction into the output tensor
+            pred_boxes_ptr[start_delta]     = scale_after * utility::clamp<T>(pred_ctr_x - T(0.5f) * pred_w, T(0), T(img_w - 1));
+            pred_boxes_ptr[start_delta + 1] = scale_after * utility::clamp<T>(pred_ctr_y - T(0.5f) * pred_h, T(0), T(img_h - 1));
+            pred_boxes_ptr[start_delta + 2] = scale_after * utility::clamp<T>(pred_ctr_x + T(0.5f) * pred_w - offset, T(0), T(img_w - 1));
+            pred_boxes_ptr[start_delta + 3] = scale_after * utility::clamp<T>(pred_ctr_y + T(0.5f) * pred_h - offset, T(0), T(img_h - 1));
+        }
+    }
+    return pred_boxes;
+}
+
+template SimpleTensor<float> bounding_box_transform(const SimpleTensor<float> &boxes, const SimpleTensor<float> &deltas, const BoundingBoxTransformInfo &info);
+template SimpleTensor<half> bounding_box_transform(const SimpleTensor<half> &boxes, const SimpleTensor<half> &deltas, const BoundingBoxTransformInfo &info);
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
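
In short this is the usual R-CNN box decoding: per class, the deltas (dx, dy, dw, dh) are divided by their weights, dw and dh are clipped to bbox_xform_clip(), the new centre is (dx * width + ctr_x, dy * height + ctr_y) and the new size is (exp(dw) * width, exp(dh) * height); the resulting corners ctr -/+ 0.5 * size (minus offset on the far corner when correct_transform_coords() is set) are clamped to [0, img_w - 1] x [0, img_h - 1] and re-scaled by scale_after.
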
diff --git a/tests/validation/reference/ArithmeticSubtraction.h b/tests/validation/reference/BoundingBoxTransform.h
similarity index 73%
copy from tests/validation/reference/ArithmeticSubtraction.h
copy to tests/validation/reference/BoundingBoxTransform.h
index 9308314..33ef9d9 100644
--- a/tests/validation/reference/ArithmeticSubtraction.h
+++ b/tests/validation/reference/BoundingBoxTransform.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,10 +21,13 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef __ARM_COMPUTE_TEST_ARITHMETIC_SUBTRACTION_H__
-#define __ARM_COMPUTE_TEST_ARITHMETIC_SUBTRACTION_H__
+#ifndef __ARM_COMPUTE_TEST_BOUNDINGBOXTRANSFORM_H__
+#define __ARM_COMPUTE_TEST_BOUNDINGBOXTRANSFORM_H__
 
-#include "tests/SimpleTensor.h"
+#include "BoundingBoxTransform.h"
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "tests/validation/Helpers.h"
 
 namespace arm_compute
@@ -35,10 +38,10 @@
 {
 namespace reference
 {
-template <typename T1, typename T2, typename T3>
-SimpleTensor<T3> arithmetic_subtraction(const SimpleTensor<T1> &src1, const SimpleTensor<T2> &src2, DataType dst_data_type, ConvertPolicy convert_policy);
+template <typename T>
+SimpleTensor<T> bounding_box_transform(const SimpleTensor<T> &boxes, const SimpleTensor<T> &deltas, const BoundingBoxTransformInfo &info);
 } // namespace reference
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
-#endif /* __ARM_COMPUTE_TEST_ARITHMETIC_SUBTRACTION_H__ */
+#endif /* __ARM_COMPUTE_TEST_BOUNDINGBOXTRANSFORM_H__ */
diff --git a/tests/validation/reference/CannyEdgeDetector.cpp b/tests/validation/reference/CannyEdgeDetector.cpp
index cfe8ae8..92a11db 100644
--- a/tests/validation/reference/CannyEdgeDetector.cpp
+++ b/tests/validation/reference/CannyEdgeDetector.cpp
@@ -31,8 +31,6 @@
 #include "tests/validation/reference/Phase.h"
 #include "tests/validation/reference/Sobel.h"
 
-#include "tests/SimpleTensorPrinter.h"
-
 #include <cmath>
 #include <stack>
 
@@ -231,7 +229,8 @@
 } // namespace
 
 template <typename T>
-SimpleTensor<T> canny_edge_detector(const SimpleTensor<T> &src, int32_t upper_thresh, int32_t lower_thresh, int gradient_size, MagnitudeType norm_type,
+SimpleTensor<T> canny_edge_detector(const SimpleTensor<T> &src,
+                                    int32_t upper_thresh, int32_t lower_thresh, int gradient_size, MagnitudeType norm_type,
                                     BorderMode border_mode, T constant_border_value)
 {
     if(gradient_size < 7)
@@ -244,7 +243,8 @@
     }
 }
 
-template SimpleTensor<uint8_t> canny_edge_detector(const SimpleTensor<uint8_t> &src, int32_t upper_thresh, int32_t lower_thresh, int gradient_size, MagnitudeType norm_type,
+template SimpleTensor<uint8_t> canny_edge_detector(const SimpleTensor<uint8_t> &src,
+                                                   int32_t upper_thresh, int32_t lower_thresh, int gradient_size, MagnitudeType norm_type,
                                                    BorderMode border_mode, uint8_t constant_border_value);
 } // namespace reference
 } // namespace validation
diff --git a/tests/validation/reference/CannyEdgeDetector.h b/tests/validation/reference/CannyEdgeDetector.h
index a46c145..ee6199d 100644
--- a/tests/validation/reference/CannyEdgeDetector.h
+++ b/tests/validation/reference/CannyEdgeDetector.h
@@ -36,7 +36,8 @@
 namespace reference
 {
 template <typename T>
-SimpleTensor<T> canny_edge_detector(const SimpleTensor<T> &src, int32_t upper_thresh, int32_t lower_thresh, int gradient_size, MagnitudeType norm_type,
+SimpleTensor<T> canny_edge_detector(const SimpleTensor<T> &src,
+                                    int32_t upper_thresh, int32_t lower_thresh, int gradient_size, MagnitudeType norm_type,
                                     BorderMode border_mode, T constant_border_value = 0);
 } // namespace reference
 } // namespace validation
diff --git a/tests/validation/reference/Col2Im.cpp b/tests/validation/reference/Col2Im.cpp
index 90e488f..53969d4 100644
--- a/tests/validation/reference/Col2Im.cpp
+++ b/tests/validation/reference/Col2Im.cpp
@@ -40,7 +40,7 @@
     SimpleTensor<T> dst{ dst_shape, src.data_type(), 1 };
 
     // Compute reference
-    const size_t batches    = dst_shape[3];
+    const size_t batches    = dst_shape.total_size() / (dst_shape.x() * dst_shape.y() * dst_shape.z());
     const size_t src_width  = src.shape().x();
     const size_t src_height = src.shape().y();
 
diff --git a/tests/validation/reference/ColorConvert.cpp b/tests/validation/reference/ColorConvert.cpp
index 8047b34..9090319 100644
--- a/tests/validation/reference/ColorConvert.cpp
+++ b/tests/validation/reference/ColorConvert.cpp
@@ -46,6 +46,7 @@
 
     switch(format)
     {
+        case Format::U8:
         case Format::RGB888:
         case Format::RGBA8888:
         case Format::YUYV422:
@@ -102,6 +103,9 @@
                 case Format::RGBA8888:
                     colorconvert_helper::detail::colorconvert_rgb_to_rgbx(tensor_planes[0], dst[0]);
                     break;
+                case Format::U8:
+                    colorconvert_helper::detail::colorconvert_rgb_to_u8(tensor_planes[0], dst[0]);
+                    break;
                 case Format::NV12:
                     colorconvert_helper::detail::colorconvert_rgb_to_nv12(tensor_planes[0], dst);
                     break;
diff --git a/tests/validation/reference/ColorConvertHelper.h b/tests/validation/reference/ColorConvertHelper.h
index 7a8b547..b2ae6f2 100644
--- a/tests/validation/reference/ColorConvertHelper.h
+++ b/tests/validation/reference/ColorConvertHelper.h
@@ -48,6 +48,10 @@
 // C_v = 1 / (2 * (1 - K_r))
 constexpr float rgb2yuv_bt709_cv = 0.6350f;
 
+constexpr float rgb2u8_red_coef   = 0.2126f;
+constexpr float rgb2u8_green_coef = 0.7152f;
+constexpr float rgb2u8_blue_coef  = 0.0722f;
+
 template <typename T>
 inline void store_rgb_from_src(const SimpleTensor<T> src, SimpleTensor<T> &rvec, SimpleTensor<T> &gvec, SimpleTensor<T> &bvec)
 {
@@ -219,6 +223,29 @@
 }
 
 template <typename T>
+inline void colorconvert_rgb_to_u8(const SimpleTensor<T> src, SimpleTensor<T> &dst)
+{
+    const int width  = dst.shape().x();
+    const int height = dst.shape().y();
+
+    for(int y = 0; y < height; ++y)
+    {
+        for(int x = 0; x < width; ++x)
+        {
+            const Coordinates src_coord{ x, y };
+            const Coordinates dst_coord{ x, y };
+
+            const auto *src_pixel = reinterpret_cast<const T *>(src(src_coord));
+            auto       *dst_pixel = reinterpret_cast<T *>(dst(dst_coord));
+
+            const float result = rgb2u8_red_coef * src_pixel[0] + rgb2u8_green_coef * src_pixel[1] + rgb2u8_blue_coef * src_pixel[2];
+
+            dst_pixel[0] = utility::clamp<float>(result, 0, 255);
+        }
+    }
+}
+
+template <typename T>
 inline void colorconvert_rgbx_to_rgb(const SimpleTensor<T> src, SimpleTensor<T> &dst)
 {
     for(int channel_idx = 0; channel_idx < dst.num_channels(); ++channel_idx)
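
The new colorconvert_rgb_to_u8() path uses the BT.709 luma weights declared above (0.2126, 0.7152, 0.0722). For example, a pure green RGB888 pixel (0, 255, 0) gives 0.7152 * 255 = 182.376, which the clamp leaves inside [0, 255] and which stores as 182 when T is uint8_t.
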
diff --git a/tests/validation/reference/ComputeAllAnchors.cpp b/tests/validation/reference/ComputeAllAnchors.cpp
new file mode 100644
index 0000000..48f4767
--- /dev/null
+++ b/tests/validation/reference/ComputeAllAnchors.cpp
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "ComputeAllAnchors.h"
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/misc/Utility.h"
+#include "tests/validation/Helpers.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+template <typename T>
+SimpleTensor<T> compute_all_anchors(const SimpleTensor<T> &anchors, const ComputeAnchorsInfo &info)
+{
+    const int   num_anchors = anchors.shape()[1];
+    const auto  width       = int(info.feat_width());
+    const auto  height      = int(info.feat_height());
+    const float stride      = 1. / info.spatial_scale();
+
+    SimpleTensor<T> all_anchors(TensorShape(4, width * height * num_anchors), anchors.data_type());
+    const T        *anchors_ptr     = anchors.data();
+    T              *all_anchors_ptr = all_anchors.data();
+
+    // Iterate over the input grid and anchors
+    for(int y = 0; y < height; y++)
+    {
+        for(int x = 0; x < width; x++)
+        {
+            for(int a = 0; a < num_anchors; a++)
+            {
+                const T      shift_x   = T(x) * T(stride);
+                const T      shift_y   = T(y) * T(stride);
+                const size_t anchor_id = a + x * num_anchors + y * width * num_anchors;
+                // x1
+                all_anchors_ptr[anchor_id * 4] = anchors_ptr[4 * a] + shift_x;
+                // y1
+                all_anchors_ptr[anchor_id * 4 + 1] = anchors_ptr[4 * a + 1] + shift_y;
+                // x2
+                all_anchors_ptr[anchor_id * 4 + 2] = anchors_ptr[4 * a + 2] + shift_x;
+                // y2
+                all_anchors_ptr[anchor_id * 4 + 3] = anchors_ptr[4 * a + 3] + shift_y;
+            }
+        }
+    }
+    return all_anchors;
+}
+template SimpleTensor<float> compute_all_anchors(const SimpleTensor<float> &anchors, const ComputeAnchorsInfo &info);
+template SimpleTensor<half> compute_all_anchors(const SimpleTensor<half> &anchors, const ComputeAnchorsInfo &info);
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
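
As a worked example of the anchor loop above: with spatial_scale = 0.25 the stride is 1 / 0.25 = 4, so anchor a at grid cell (x = 1, y = 2) is the base anchor (x1, y1, x2, y2) shifted by (4, 8, 4, 8), and it is written at row a + 1 * num_anchors + 2 * width * num_anchors of the (4, width * height * num_anchors) output tensor.
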
diff --git a/tests/validation/reference/ArithmeticSubtraction.h b/tests/validation/reference/ComputeAllAnchors.h
similarity index 75%
copy from tests/validation/reference/ArithmeticSubtraction.h
copy to tests/validation/reference/ComputeAllAnchors.h
index 9308314..b21bf3c 100644
--- a/tests/validation/reference/ArithmeticSubtraction.h
+++ b/tests/validation/reference/ComputeAllAnchors.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,10 +21,11 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef __ARM_COMPUTE_TEST_ARITHMETIC_SUBTRACTION_H__
-#define __ARM_COMPUTE_TEST_ARITHMETIC_SUBTRACTION_H__
+#ifndef __ARM_COMPUTE_TEST_COMPUTEALLANCHORS_H__
+#define __ARM_COMPUTE_TEST_COMPUTEALLANCHORS_H__
 
-#include "tests/SimpleTensor.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "tests/validation/Helpers.h"
 
 namespace arm_compute
@@ -35,10 +36,10 @@
 {
 namespace reference
 {
-template <typename T1, typename T2, typename T3>
-SimpleTensor<T3> arithmetic_subtraction(const SimpleTensor<T1> &src1, const SimpleTensor<T2> &src2, DataType dst_data_type, ConvertPolicy convert_policy);
+template <typename T>
+SimpleTensor<T> compute_all_anchors(const SimpleTensor<T> &anchors, const ComputeAnchorsInfo &info);
 } // namespace reference
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
-#endif /* __ARM_COMPUTE_TEST_ARITHMETIC_SUBTRACTION_H__ */
+#endif /* __ARM_COMPUTE_TEST_COMPUTEALLANCHORS_H__ */
diff --git a/tests/validation/reference/ConvolutionLayer.cpp b/tests/validation/reference/ConvolutionLayer.cpp
index 7dbdba9..f41a6fc 100644
--- a/tests/validation/reference/ConvolutionLayer.cpp
+++ b/tests/validation/reference/ConvolutionLayer.cpp
@@ -139,4 +139,4 @@
 } // namespace reference
 } // namespace validation
 } // namespace test
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/tests/validation/reference/DeconvolutionLayer.cpp b/tests/validation/reference/DeconvolutionLayer.cpp
index e73023e..5ca3b44 100644
--- a/tests/validation/reference/DeconvolutionLayer.cpp
+++ b/tests/validation/reference/DeconvolutionLayer.cpp
@@ -33,19 +33,31 @@
 {
 namespace reference
 {
-template <typename T>
-SimpleTensor<T> deconvolution_layer(const SimpleTensor<T> &src, const SimpleTensor<T> &weights, const SimpleTensor<T> &bias, const TensorShape &output_shape,
+template <typename T, typename TB>
+SimpleTensor<T> deconvolution_layer(const SimpleTensor<T> &src, const SimpleTensor<T> &weights, const SimpleTensor<TB> &bias, const TensorShape &output_shape,
                                     const PadStrideInfo &info, const std::pair<unsigned int, unsigned int> &a)
 {
     // Create reference
-    const int   stride_x     = info.stride().first;
-    const int   stride_y     = info.stride().second;
+    const int stride_x           = info.stride().first;
+    const int stride_y           = info.stride().second;
+    const int weights_width      = weights.shape().x();
+    const int weights_height     = weights.shape().y();
+    const int weights_upper_dims = weights.shape().total_size() / (weights_width * weights_height);
+
+    // Find the upsampled dimensions
+    unsigned int out_x = (src.shape().x() - 1) * stride_x + a.first + 1;
+    unsigned int out_y = (src.shape().y() - 1) * stride_y + a.second + 1;
+
+    // Find the padding needed for the convolution with stride 1 in order to match output shape
+    unsigned int padx = output_shape.x() - (out_x - weights_width + 1);
+    unsigned int pady = output_shape.y() - (out_y - weights_height + 1);
+    out_x += padx;
+    out_y += pady;
+
     TensorShape scaled_shape = src.shape();
-    int         out_x        = src.shape().x() + (src.shape().x() - 1) * (stride_x - 1) + a.first + 2 * info.pad().first;
-    int         out_y        = src.shape().y() + (src.shape().y() - 1) * (stride_y - 1) + a.second + 2 * info.pad().second;
     scaled_shape.set(0, out_x);
     scaled_shape.set(1, out_y);
-    SimpleTensor<T> scaled{ scaled_shape, src.data_type(), 1 };
+    SimpleTensor<T> scaled{ scaled_shape, src.data_type(), 1, src.quantization_info() };
 
     const int width_in      = src.shape().x();
     const int height_in     = src.shape().y();
@@ -59,19 +71,38 @@
     ARM_COMPUTE_ERROR_ON_MSG(ax > stride_x - 1, "ax must be smaller than stride_x");
     ARM_COMPUTE_ERROR_ON_MSG(ay > stride_y - 1, "ay must be smaller than stride_y");
 
-    for(int j = 0; j < scaled.num_elements(); ++j)
+    if(src.data_type() == DataType::QASYMM8)
     {
-        scaled[j] = T(0);
+        const uint8_t quantized_zero = src.quantization_info().offset;
+        std::fill_n(scaled.data(), scaled.num_elements(), quantized_zero);
+    }
+    else
+    {
+        std::fill_n(scaled.data(), scaled.num_elements(), T(0));
+    }
+
+    // Flip weights by 180 degrees
+    SimpleTensor<T> weights_flipped{ weights.shape(), weights.data_type(), 1, weights.quantization_info() };
+    for(int ud = 0; ud < weights_upper_dims; ++ud)
+    {
+        const int offset = ud * weights_width * weights_height;
+        for(int y = 0; y < weights_height; ++y)
+        {
+            for(int x = 0; x < weights_width; ++x)
+            {
+                weights_flipped[offset + (weights_height - 1 - y) * weights_width + (weights_width - 1 - x)] = weights[offset + y * weights_width + x];
+            }
+        }
     }
 
     for(int slice = 0; slice < num_2d_slices; ++slice)
     {
         const int offset_slice_in  = slice * width_in * height_in;
         const int offset_slice_out = slice * width_scaled * height_scaled;
-        const int start_x          = info.pad().first;
-        const int start_y          = ay + info.pad().second;
-        const int end_y            = height_scaled - info.pad().second;
-        const int end_x            = width_scaled - ax - info.pad().first;
+        const int start_x          = padx / 2;
+        const int start_y          = ay + pady / 2;
+        const int end_y            = height_scaled - pady / 2;
+        const int end_x            = width_scaled - ax - padx / 2;
 
         for(int yi = start_y, in_y = 0; yi < end_y; yi += stride_y, in_y++)
         {
@@ -85,9 +116,11 @@
     }
 
     const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
-    return convolution_layer(scaled, weights, bias, output_shape, conv_info);
+    return convolution_layer(scaled, weights_flipped, bias, output_shape, conv_info);
 }
 
+template SimpleTensor<uint8_t> deconvolution_layer(const SimpleTensor<uint8_t> &src, const SimpleTensor<uint8_t> &weights, const SimpleTensor<int32_t> &bias, const TensorShape &output_shape,
+                                                   const PadStrideInfo &info, const std::pair<unsigned int, unsigned int> &a);
 template SimpleTensor<float> deconvolution_layer(const SimpleTensor<float> &src, const SimpleTensor<float> &weights, const SimpleTensor<float> &bias, const TensorShape &output_shape,
                                                  const PadStrideInfo &info, const std::pair<unsigned int, unsigned int> &a);
 template SimpleTensor<half> deconvolution_layer(const SimpleTensor<half> &src, const SimpleTensor<half> &weights, const SimpleTensor<half> &bias, const TensorShape &output_shape,
diff --git a/tests/validation/reference/DeconvolutionLayer.h b/tests/validation/reference/DeconvolutionLayer.h
index c0bc1fa..95fb416 100644
--- a/tests/validation/reference/DeconvolutionLayer.h
+++ b/tests/validation/reference/DeconvolutionLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017, 2018 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -45,8 +45,8 @@
  * a                The number of zeros added to right and top edges of the input.
  *
  */
-template <typename T>
-SimpleTensor<T> deconvolution_layer(const SimpleTensor<T> &src, const SimpleTensor<T> &weights, const SimpleTensor<T> &bias, const TensorShape &output_shape, const PadStrideInfo &info,
+template <typename T, typename TB>
+SimpleTensor<T> deconvolution_layer(const SimpleTensor<T> &src, const SimpleTensor<T> &weights, const SimpleTensor<TB> &bias, const TensorShape &output_shape, const PadStrideInfo &info,
                                     const std::pair<unsigned int, unsigned int> &a);
 } // namespace reference
 } // namespace validation
diff --git a/tests/validation/reference/EqualizeHistogram.cpp b/tests/validation/reference/EqualizeHistogram.cpp
index 0e966cd..1a10c2c 100644
--- a/tests/validation/reference/EqualizeHistogram.cpp
+++ b/tests/validation/reference/EqualizeHistogram.cpp
@@ -66,7 +66,7 @@
     }
     else
     {
-        const float diff = total_num_pixels - 1;
+        const float diff = total_num_pixels - cd_min;
 
         for(size_t i = 0; i < num_bins; ++i)
         {
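The fix above changes the denominator from total_num_pixels - 1 to total_num_pixels - cd_min, which matches the usual equalisation mapping lut[i] = round((cdf[i] - cdf_min) / (N - cdf_min) * 255). A standalone sketch of that mapping for a U8 histogram, assuming the constant-image case is handled with an identity LUT as in the branch not shown here (names are illustrative):

#include <algorithm>
#include <array>
#include <cmath>
#include <cstddef>
#include <cstdint>

// Histogram-equalisation LUT for a U8 image:
// lut[i] = round((cdf[i] - cdf_min) / (num_pixels - cdf_min) * 255)
std::array<uint8_t, 256> make_equalize_lut(const std::array<uint32_t, 256> &hist, uint32_t num_pixels)
{
    std::array<uint32_t, 256> cdf{};
    uint32_t running = 0;
    for(std::size_t i = 0; i < hist.size(); ++i)
    {
        running += hist[i];
        cdf[i] = running;
    }

    // First non-zero CDF value
    const auto     it     = std::find_if(cdf.begin(), cdf.end(), [](uint32_t c) { return c != 0; });
    const uint32_t cd_min = (it != cdf.end()) ? *it : 0;

    std::array<uint8_t, 256> lut{};
    if(cd_min == num_pixels) // constant image: identity mapping
    {
        for(std::size_t i = 0; i < lut.size(); ++i)
        {
            lut[i] = static_cast<uint8_t>(i);
        }
        return lut;
    }

    const float diff = static_cast<float>(num_pixels - cd_min);
    for(std::size_t i = 0; i < lut.size(); ++i)
    {
        const uint32_t num = (cdf[i] > cd_min) ? cdf[i] - cd_min : 0;
        lut[i]             = static_cast<uint8_t>(std::lround(num / diff * 255.0f));
    }
    return lut;
}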
diff --git a/tests/validation/reference/Floor.cpp b/tests/validation/reference/Floor.cpp
index 1c73944..b011a16 100644
--- a/tests/validation/reference/Floor.cpp
+++ b/tests/validation/reference/Floor.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -50,6 +50,7 @@
     return dst;
 }
 
+template SimpleTensor<half> floor_layer(const SimpleTensor<half> &src);
 template SimpleTensor<float> floor_layer(const SimpleTensor<float> &src);
 } // namespace reference
 } // namespace validation
diff --git a/tests/validation/reference/GEMMLowp.cpp b/tests/validation/reference/GEMMLowp.cpp
index 8e41aef..9a7e409 100644
--- a/tests/validation/reference/GEMMLowp.cpp
+++ b/tests/validation/reference/GEMMLowp.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -98,41 +98,52 @@
 } // namespace
 
 template <typename T_out, typename T_in>
-SimpleTensor<T_out> gemmlowp_matrix_multiply_core(const SimpleTensor<T_in> &a, const SimpleTensor<T_in> &b, int32_t a_offset, int32_t b_offset)
+SimpleTensor<T_out> gemmlowp_matrix_multiply_core(const SimpleTensor<T_in> &a, const SimpleTensor<T_in> &b, TensorShape shape_c, int32_t a_offset, int32_t b_offset)
 {
     static_assert(std::is_same<typename std::decay<T_out>::type, int32_t>::value, "Only int32_t is allowed for the output");
 
-    TensorShape         shape(b.shape()[0], a.shape()[1]);
     DataType            dt = std::is_same<T_out, int32_t>::value ? DataType::S32 : DataType::U32;
-    SimpleTensor<T_out> c(shape, dt);
+    SimpleTensor<T_out> c(shape_c, dt);
 
-    const int K       = a.shape().x();
-    const int b_width = b.shape().x();
-    const int rows    = c.shape().y(); //M
-    const int cols    = c.shape().x(); //N
+    const int K = a.shape().x();
+    const int M = a.shape().y();
+    const int N = b.shape().x();
+    const int D = a.shape().z(); // Number of matrices in a batch
+
+    const int a_stride_z = K * M;
+    // Do not slide the matrix B along the 3rd dimension in case matrix B has less than 3 dimensions
+    const int b_stride_z = b.shape().num_dimensions() > 2 ? N * K : 0;
+    const int c_stride_z = N * M;
 
     std::vector<T_out> acc;
-    acc.resize(cols);
+    acc.resize(N);
 
-    for(int i = 0; i < rows; ++i)
+    for(int depth = 0; depth < D; ++depth)
     {
-        for(int j = 0; j < cols; ++j)
+        const int base_addr_a = depth * a_stride_z;
+        const int base_addr_b = depth * b_stride_z;
+        const int base_addr_c = depth * c_stride_z;
+
+        for(int i = 0; i < M; ++i)
         {
-            acc[j] = 0;
-        }
-        for(int k = 0; k < K; ++k)
-        {
-            const T_out tmp_a = a_offset + static_cast<T_out>(a[k + i * K]);
-            for(int j = 0; j < b_width; ++j)
+            for(int j = 0; j < N; ++j)
             {
-                const T_out tmp_b       = b_offset + static_cast<T_out>(b[j + k * b_width]);
-                const T_out mult_as_int = tmp_a * tmp_b;
-                acc[j] += mult_as_int;
+                acc[j] = 0;
             }
-        }
-        for(int j = 0; j < cols; ++j)
-        {
-            c[j + i * cols] = acc[j];
+            for(int k = 0; k < K; ++k)
+            {
+                const T_out tmp_a = a_offset + static_cast<T_out>(a[base_addr_a + k + i * K]);
+                for(int j = 0; j < N; ++j)
+                {
+                    const T_out tmp_b       = b_offset + static_cast<T_out>(b[base_addr_b + j + k * N]);
+                    const T_out mult_as_int = tmp_a * tmp_b;
+                    acc[j] += mult_as_int;
+                }
+            }
+            for(int j = 0; j < N; ++j)
+            {
+                c[base_addr_c + j + i * N] = acc[j];
+            }
         }
     }
 
@@ -141,9 +152,9 @@
 
 // used to validate assembly kernels which don't know anything about offsets
 template <typename T1, typename T2>
-SimpleTensor<T1> gemmlowp(const SimpleTensor<T2> &a, const SimpleTensor<T2> &b)
+SimpleTensor<T1> gemmlowp(const SimpleTensor<T2> &a, const SimpleTensor<T2> &b, TensorShape shape_c)
 {
-    return gemmlowp_matrix_multiply_core<T1, T2>(a, b, 0, 0);
+    return gemmlowp_matrix_multiply_core<T1, T2>(a, b, shape_c, 0, 0);
 }
 
 template <typename T>
@@ -198,10 +209,10 @@
                                                                            int32_t max);
 template SimpleTensor<uint8_t> gemmlowp_quantize_down_int32_to_uint8_scale(const SimpleTensor<int32_t> &a, const SimpleTensor<int32_t> &b, int32_t result_offset, int32_t result_mult_int,
                                                                            int32_t result_shift, int32_t min, int32_t max);
-template SimpleTensor<int32_t> gemmlowp_matrix_multiply_core(const SimpleTensor<int8_t> &a, const SimpleTensor<int8_t> &b, int32_t a_offset, int32_t b_offset);
-template SimpleTensor<int32_t> gemmlowp_matrix_multiply_core(const SimpleTensor<uint8_t> &a, const SimpleTensor<uint8_t> &b, int32_t a_offset, int32_t b_offset);
-template SimpleTensor<int32_t> gemmlowp(const SimpleTensor<int8_t> &a, const SimpleTensor<int8_t> &b);
-template SimpleTensor<int32_t> gemmlowp(const SimpleTensor<uint8_t> &a, const SimpleTensor<uint8_t> &b);
+template SimpleTensor<int32_t> gemmlowp_matrix_multiply_core(const SimpleTensor<int8_t> &a, const SimpleTensor<int8_t> &b, TensorShape shape_c, int32_t a_offset, int32_t b_offset);
+template SimpleTensor<int32_t> gemmlowp_matrix_multiply_core(const SimpleTensor<uint8_t> &a, const SimpleTensor<uint8_t> &b, TensorShape shape_c, int32_t a_offset, int32_t b_offset);
+template SimpleTensor<int32_t> gemmlowp(const SimpleTensor<int8_t> &a, const SimpleTensor<int8_t> &b, TensorShape shape_c);
+template SimpleTensor<int32_t> gemmlowp(const SimpleTensor<uint8_t> &a, const SimpleTensor<uint8_t> &b, TensorShape shape_c);
 } // namespace reference
 } // namespace validation
 } // namespace test
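The GEMMLowp core change adds a batch (z) dimension with explicit base addresses, and sets b_stride_z to 0 so a 2-D matrix B is broadcast across batches. A standalone sketch of the offset-aware accumulation for a single batch, assuming row-major uint8 inputs and an int32 accumulator (names are illustrative, not the library API):

#include <cstddef>
#include <cstdint>
#include <vector>

// One batch of the low-precision GEMM core:
// c[MxN] = (a + a_offset) * (b + b_offset), accumulated in int32.
std::vector<int32_t> gemmlowp_core(const std::vector<uint8_t> &a, const std::vector<uint8_t> &b,
                                   int M, int N, int K, int32_t a_offset, int32_t b_offset)
{
    std::vector<int32_t> c(static_cast<std::size_t>(M) * N, 0);
    for(int i = 0; i < M; ++i)
    {
        for(int k = 0; k < K; ++k)
        {
            const int32_t tmp_a = a_offset + static_cast<int32_t>(a[i * K + k]);
            for(int j = 0; j < N; ++j)
            {
                const int32_t tmp_b = b_offset + static_cast<int32_t>(b[k * N + j]);
                c[i * N + j] += tmp_a * tmp_b;
            }
        }
    }
    return c;
}
// In the batched reference this loop runs once per depth slice, with
// base_addr_b = depth * b_stride_z; b_stride_z == 0 for a 2-D B means the
// same B matrix is reused by every batch.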
diff --git a/tests/validation/reference/GEMMLowp.h b/tests/validation/reference/GEMMLowp.h
index a3d0beb..4396155 100644
--- a/tests/validation/reference/GEMMLowp.h
+++ b/tests/validation/reference/GEMMLowp.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -38,13 +38,13 @@
 template <typename T>
 SimpleTensor<uint8_t> gemmlowp_quantize_down_int32_to_uint8_scale(const SimpleTensor<T> &in, int32_t result_offset, int32_t result_mult_int, int32_t result_shift, int32_t min = 0, int32_t max = 0);
 template <typename T1, typename T2>
-SimpleTensor<T1> gemmlowp_matrix_multiply_core(const SimpleTensor<T2> &a, const SimpleTensor<T2> &b, int32_t a_offset, int32_t b_offset);
+SimpleTensor<T1> gemmlowp_matrix_multiply_core(const SimpleTensor<T2> &a, const SimpleTensor<T2> &b, TensorShape shape_c, int32_t a_offset, int32_t b_offset);
 
 template <typename T>
 SimpleTensor<uint8_t> gemmlowp_quantize_down_int32_to_uint8_scale(const SimpleTensor<T> &in, int32_t result_offset, int32_t result_mult_int, int32_t result_shift);
 
 template <typename T1, typename T2>
-SimpleTensor<T1> gemmlowp(const SimpleTensor<T2> &a, const SimpleTensor<T2> &b);
+SimpleTensor<T1> gemmlowp(const SimpleTensor<T2> &a, const SimpleTensor<T2> &b, TensorShape shape_c);
 
 template <typename T>
 SimpleTensor<uint8_t> gemmlowp_quantize_down_int32_to_uint8_scale(const SimpleTensor<T> &in, const SimpleTensor<T> &bias, int32_t result_offset, int32_t result_mult_int, int32_t result_shift,
diff --git a/tests/validation/reference/HarrisCornerDetector.h b/tests/validation/reference/HarrisCornerDetector.h
index 042e857..f208eaa 100644
--- a/tests/validation/reference/HarrisCornerDetector.h
+++ b/tests/validation/reference/HarrisCornerDetector.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -38,8 +38,9 @@
 namespace reference
 {
 template <typename T>
-std::vector<KeyPoint> harris_corner_detector(const SimpleTensor<T> &src, float threshold, float min_dist, float sensitivity, int gradient_size, int block_size, BorderMode border_mode,
-                                             T constant_border_value = 0);
+std::vector<KeyPoint> harris_corner_detector(const SimpleTensor<T> &src,
+                                             float threshold, float min_dist, float sensitivity, int gradient_size, int block_size,
+                                             BorderMode border_mode, T constant_border_value = 0);
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/Im2Col.cpp b/tests/validation/reference/Im2Col.cpp
index 0c41d88..076b2ab 100644
--- a/tests/validation/reference/Im2Col.cpp
+++ b/tests/validation/reference/Im2Col.cpp
@@ -93,52 +93,6 @@
 void im2col_nhwc(const SimpleTensor<T> &src, SimpleTensor<T> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias)
 {
     ARM_COMPUTE_ERROR_ON(src.data_layout() != DataLayout::NHWC);
-    const int pad_x         = conv_info.pad().first;
-    const int pad_y         = conv_info.pad().second;
-    const int stride_x      = conv_info.stride().first;
-    const int stride_y      = conv_info.stride().second;
-    const int kernel_width  = kernel_dims.width;
-    const int kernel_height = kernel_dims.height;
-    const int src_width     = src.shape().y();
-    const int src_height    = src.shape().z();
-    const int src_depth     = src.shape().x();
-    const int batches       = src.shape().total_size_upper(3);
-    const int pad_val       = is_data_type_quantized_asymmetric(src.data_type()) ? src.quantization_info().offset : 0;
-    int       dst_idx       = 0;
-
-    const int lasty = src_height + (kernel_height > 1 ? pad_y : 0) - kernel_height;
-    const int lastx = src_width + (kernel_width > 1 ? pad_x : 0) - kernel_width;
-
-    for(int b = 0; b < batches; ++b)
-    {
-        for(int y = -pad_y; y <= lasty; y += stride_y)
-        {
-            for(int x = -pad_x; x <= lastx; x += stride_x)
-            {
-                for(int z = 0; z < src_depth; ++z)
-                {
-                    for(int patch_y = y; patch_y < (y + kernel_height); ++patch_y)
-                    {
-                        for(int patch_x = x; patch_x < (x + kernel_width); ++patch_x)
-                        {
-                            dst[dst_idx++] = tensor_elem_at(src, Coordinates(z, patch_x, patch_y, b), BorderMode::CONSTANT, static_cast<T>(pad_val));
-                        }
-                    }
-                }
-
-                if(has_bias)
-                {
-                    dst[dst_idx++] = static_cast<T>(1);
-                }
-            }
-        }
-    }
-}
-
-template <typename T>
-void im2col_nhwc_channel_first(const SimpleTensor<T> &src, SimpleTensor<T> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias)
-{
-    ARM_COMPUTE_ERROR_ON(src.data_layout() != DataLayout::NHWC);
     const int stride_x      = conv_info.stride().first;
     const int stride_y      = conv_info.stride().second;
     const int kernel_width  = kernel_dims.width;
@@ -185,7 +139,7 @@
 }
 
 template <typename T>
-void im2col(const SimpleTensor<T> &src, SimpleTensor<T> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const unsigned int num_groups, bool channels_first_output_nhwc)
+void im2col(const SimpleTensor<T> &src, SimpleTensor<T> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const unsigned int num_groups)
 {
     switch(src.data_layout())
     {
@@ -196,14 +150,7 @@
         }
         case DataLayout::NHWC:
         {
-            if(channels_first_output_nhwc)
-            {
-                im2col_nhwc_channel_first(src, dst, kernel_dims, conv_info, has_bias);
-            }
-            else
-            {
-                im2col_nhwc(src, dst, kernel_dims, conv_info, has_bias);
-            }
+            im2col_nhwc(src, dst, kernel_dims, conv_info, has_bias);
             break;
         }
         default:
@@ -214,12 +161,9 @@
     }
 }
 
-template void im2col(const SimpleTensor<uint8_t> &src, SimpleTensor<uint8_t> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, unsigned int num_groups,
-                     bool channels_first_output_nhwc);
-template void im2col(const SimpleTensor<half> &src, SimpleTensor<half> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, unsigned int num_groups,
-                     bool channels_first_output_nhwc);
-template void im2col(const SimpleTensor<float> &src, SimpleTensor<float> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, unsigned int num_groups,
-                     bool channels_first_output_nhwc);
+template void im2col(const SimpleTensor<uint8_t> &src, SimpleTensor<uint8_t> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, unsigned int num_groups);
+template void im2col(const SimpleTensor<half> &src, SimpleTensor<half> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, unsigned int num_groups);
+template void im2col(const SimpleTensor<float> &src, SimpleTensor<float> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, unsigned int num_groups);
 } // namespace reference
 } // namespace validation
 } // namespace test
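After the cleanup a single NHWC im2col path remains. The idea is the standard one: each sliding-window position becomes one output row holding the window's elements, optionally followed by a 1 for the bias term, with zero as the out-of-bounds value. A compact single-image sketch (layout and element ordering here are assumptions for illustration, not the library's exact ordering):

#include <cstddef>
#include <vector>

// im2col for an NHWC image laid out as src[((y * W) + x) * C + c].
// Each output row holds one kernel_h x kernel_w x C patch, plus a trailing 1 if has_bias.
std::vector<float> im2col_nhwc(const std::vector<float> &src, int W, int H, int C,
                               int kernel_w, int kernel_h, int stride, int pad, bool has_bias)
{
    const int out_w   = (W + 2 * pad - kernel_w) / stride + 1;
    const int out_h   = (H + 2 * pad - kernel_h) / stride + 1;
    const int row_len = kernel_w * kernel_h * C + (has_bias ? 1 : 0);

    std::vector<float> dst;
    dst.reserve(static_cast<std::size_t>(out_w) * out_h * row_len);

    for(int oy = 0; oy < out_h; ++oy)
    {
        for(int ox = 0; ox < out_w; ++ox)
        {
            for(int ky = 0; ky < kernel_h; ++ky)
            {
                for(int kx = 0; kx < kernel_w; ++kx)
                {
                    const int y = oy * stride - pad + ky;
                    const int x = ox * stride - pad + kx;
                    for(int c = 0; c < C; ++c)
                    {
                        const bool inside = (x >= 0 && x < W && y >= 0 && y < H);
                        dst.push_back(inside ? src[(static_cast<std::size_t>(y) * W + x) * C + c] : 0.0f);
                    }
                }
            }
            if(has_bias)
            {
                dst.push_back(1.0f); // bias column
            }
        }
    }
    return dst;
}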
diff --git a/tests/validation/reference/Im2Col.h b/tests/validation/reference/Im2Col.h
index 84ee237..f519d0e 100644
--- a/tests/validation/reference/Im2Col.h
+++ b/tests/validation/reference/Im2Col.h
@@ -35,8 +35,7 @@
 namespace reference
 {
 template <typename T>
-void im2col(const SimpleTensor<T> &src, SimpleTensor<T> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const unsigned int num_groups,
-            bool channels_first_output_nhwc = false);
+void im2col(const SimpleTensor<T> &src, SimpleTensor<T> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const unsigned int num_groups);
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/L2NormalizeLayer.cpp b/tests/validation/reference/L2NormalizeLayer.cpp
index 99f4e8a..fcd6226 100644
--- a/tests/validation/reference/L2NormalizeLayer.cpp
+++ b/tests/validation/reference/L2NormalizeLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -57,24 +57,38 @@
     SimpleTensor<T> sum = reduction_operation(src, get_output_shape(src.shape(), axis), axis, ReductionOperation::SUM_SQUARE);
 
     // Compute reference
-    const int elems      = src.shape()[axis];
-    const int upper_dims = src.shape().total_size_upper(axis + 1);
+    const int upper_dims     = src.shape().total_size_upper(axis + 1);
+    const int lower_dims     = src.shape().total_size_lower(axis + 1);
+    const int lower_dims_sum = sum.shape().total_size_lower(axis + 1);
 
     for(int du = 0; du < upper_dims; ++du)
     {
-        if(axis == 0)
+        const T *src_row_ptr = src.data() + du * lower_dims;
+        T       *dst_row_ptr = dst.data() + du * lower_dims;
+        switch(axis)
         {
-            const T *src_row_ptr         = src.data() + du * elems;
-            T       *dst_row_ptr         = dst.data() + du * elems;
-            const T  normalization_value = std::sqrt(std::max(sum[du], epsilon));
-            std::transform(src_row_ptr, src_row_ptr + elems, dst_row_ptr, [normalization_value](T val)
+            case 0:
             {
-                return val / normalization_value;
-            });
-        }
-        else
-        {
-            ARM_COMPUTE_ERROR("Unsupported normalization axis");
+                const int elems               = src.shape()[0];
+                const T   normalization_value = sqrt(std::max(sum[du], static_cast<T>(epsilon)));
+                std::transform(src_row_ptr, src_row_ptr + elems, dst_row_ptr, [normalization_value](T val)
+                {
+                    return val / normalization_value;
+                });
+            }
+            break;
+            case 1:
+            case 2:
+            {
+                for(int ld = 0; ld < lower_dims; ++ld)
+                {
+                    const T normalization_value = sqrt(std::max(sum[ld % lower_dims_sum + du * lower_dims_sum], static_cast<T>(epsilon)));
+                    dst_row_ptr[ld]             = src_row_ptr[ld] / normalization_value;
+                }
+            }
+            break;
+            default:
+                ARM_COMPUTE_ERROR("Axis not supported");
         }
     }
 
@@ -82,6 +96,7 @@
 }
 
 template SimpleTensor<float> l2_normalize(const SimpleTensor<float> &src, unsigned int axis, float epsilon);
+template SimpleTensor<half> l2_normalize(const SimpleTensor<half> &src, unsigned int axis, float epsilon);
 } // namespace reference
 } // namespace validation
 } // namespace test
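The rewritten L2-normalize reference supports axis 0, 1 and 2 by dividing every element by sqrt(max(sum_of_squares_along_axis, epsilon)). A small standalone sketch of the axis-0 case, which is the simplest to read (illustration only):

#include <algorithm>
#include <cmath>
#include <vector>

// L2-normalize each row of a row-major [rows x cols] matrix along its columns.
std::vector<float> l2_normalize_axis0(const std::vector<float> &src, int rows, int cols, float epsilon)
{
    std::vector<float> dst(src.size());
    for(int r = 0; r < rows; ++r)
    {
        float sum_sq = 0.0f;
        for(int c = 0; c < cols; ++c)
        {
            const float v = src[r * cols + c];
            sum_sq += v * v;
        }
        const float norm = std::sqrt(std::max(sum_sq, epsilon));
        for(int c = 0; c < cols; ++c)
        {
            dst[r * cols + c] = src[r * cols + c] / norm;
        }
    }
    return dst;
}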
diff --git a/tests/validation/reference/LaplacianPyramid.cpp b/tests/validation/reference/LaplacianPyramid.cpp
index 5668474..21ddc1e 100644
--- a/tests/validation/reference/LaplacianPyramid.cpp
+++ b/tests/validation/reference/LaplacianPyramid.cpp
@@ -23,7 +23,7 @@
  */
 #include "LaplacianPyramid.h"
 
-#include "tests/validation/reference/ArithmeticSubtraction.h"
+#include "tests/validation/reference/ArithmeticOperations.h"
 #include "tests/validation/reference/DepthConvertLayer.h"
 #include "tests/validation/reference/Gaussian5x5.h"
 #include "tests/validation/reference/GaussianPyramidHalf.h"
@@ -53,7 +53,10 @@
         const SimpleTensor<T> level_filtered = reference::gaussian5x5(gaussian_level_pyramid[i], border_mode, constant_border_value);
         pyramid_conv.push_back(level_filtered);
 
-        const SimpleTensor<U> level_sub = reference::arithmetic_subtraction<T, T, U>(gaussian_level_pyramid[i], level_filtered, dst.data_type(), ConvertPolicy::WRAP);
+        const SimpleTensor<U> level_filtered_converted = depth_convert<T, U>(level_filtered, DataType::S16, ConvertPolicy::WRAP, 0);
+        const SimpleTensor<U> gaussian_level_converted = depth_convert<T, U>(gaussian_level_pyramid[i], DataType::S16, ConvertPolicy::WRAP, 0);
+
+        const SimpleTensor<U> level_sub = reference::arithmetic_operation<U>(reference::ArithmeticOperation::SUB, gaussian_level_converted, level_filtered_converted, dst.data_type(), ConvertPolicy::WRAP);
         pyramid_dst.push_back(level_sub);
     }
 
diff --git a/tests/validation/reference/LaplacianReconstruct.cpp b/tests/validation/reference/LaplacianReconstruct.cpp
index 2346828..ef14355 100644
--- a/tests/validation/reference/LaplacianReconstruct.cpp
+++ b/tests/validation/reference/LaplacianReconstruct.cpp
@@ -24,7 +24,7 @@
 #include "LaplacianReconstruct.h"
 
 #include "arm_compute/core/Types.h"
-#include "tests/validation/reference/ArithmeticAddition.h"
+#include "tests/validation/reference/ArithmeticOperations.h"
 #include "tests/validation/reference/DepthConvertLayer.h"
 #include "tests/validation/reference/Scale.h"
 
@@ -45,7 +45,7 @@
     const DataType data_type  = low_res.data_type();
 
     // input + L(n-1)
-    tmp_pyramid[last_level] = reference::arithmetic_addition(low_res, pyramid[last_level], data_type, ConvertPolicy::SATURATE);
+    tmp_pyramid[last_level] = reference::arithmetic_operation(reference::ArithmeticOperation::ADD, low_res, pyramid[last_level], data_type, ConvertPolicy::SATURATE);
 
     // Scale levels n-1 to 1, and add levels n-2 to 0
     for(size_t i = last_level; i-- > 0;)
@@ -56,7 +56,7 @@
         tmp_pyramid[i] = reference::scale(tmp_pyramid[i + 1], scale_x, scale_y, InterpolationPolicy::NEAREST_NEIGHBOR,
                                           border_mode, constant_border_value, SamplingPolicy::CENTER, false);
 
-        tmp_pyramid[i] = reference::arithmetic_addition(tmp_pyramid[i], pyramid[i], data_type, ConvertPolicy::SATURATE);
+        tmp_pyramid[i] = reference::arithmetic_operation(reference::ArithmeticOperation::ADD, tmp_pyramid[i], pyramid[i], data_type, ConvertPolicy::SATURATE);
     }
 
     return reference::depth_convert<T, U>(tmp_pyramid[0], DataType::U8, ConvertPolicy::SATURATE, 0);
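Both Laplacian references now route through a single arithmetic_operation helper (ADD for reconstruction, SUB for pyramid levels) instead of separate addition/subtraction functions, with the pyramid operands converted to S16 first so negative differences survive. A standalone sketch of that shared element-wise operation with saturation (enum and function names are illustrative):

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

enum class ArithmeticOp { ADD, SUB };

// Element-wise ADD/SUB on S16 data with saturation, the generic operation the
// two Laplacian references now share.
std::vector<int16_t> arithmetic_op_s16(ArithmeticOp op, const std::vector<int16_t> &a, const std::vector<int16_t> &b)
{
    std::vector<int16_t> dst(a.size());
    for(std::size_t i = 0; i < a.size(); ++i)
    {
        const int32_t val = (op == ArithmeticOp::ADD) ? int32_t(a[i]) + b[i] : int32_t(a[i]) - b[i];
        dst[i]            = static_cast<int16_t>(std::min<int32_t>(32767, std::max<int32_t>(-32768, val)));
    }
    return dst;
}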
diff --git a/tests/validation/reference/NormalizationLayer.cpp b/tests/validation/reference/NormalizationLayer.cpp
index 2ae68c6..e6ca233 100644
--- a/tests/validation/reference/NormalizationLayer.cpp
+++ b/tests/validation/reference/NormalizationLayer.cpp
@@ -33,7 +33,7 @@
 {
 namespace reference
 {
-template <typename T, typename std::enable_if<is_floating_point<T>::value, int>::type>
+template <typename T>
 SimpleTensor<T> normalization_layer(const SimpleTensor<T> &src, NormalizationLayerInfo info)
 {
     // Create reference
diff --git a/tests/validation/reference/NormalizationLayer.h b/tests/validation/reference/NormalizationLayer.h
index 3448baf..81b95cc 100644
--- a/tests/validation/reference/NormalizationLayer.h
+++ b/tests/validation/reference/NormalizationLayer.h
@@ -35,7 +35,7 @@
 {
 namespace reference
 {
-template <typename T, typename std::enable_if<is_floating_point<T>::value, int>::type = 0>
+template <typename T>
 SimpleTensor<T> normalization_layer(const SimpleTensor<T> &src, NormalizationLayerInfo info);
 
 } // namespace reference
diff --git a/tests/validation/reference/NormalizePlanarYUVLayer.cpp b/tests/validation/reference/NormalizePlanarYUVLayer.cpp
index 2442943..563e2a7 100644
--- a/tests/validation/reference/NormalizePlanarYUVLayer.cpp
+++ b/tests/validation/reference/NormalizePlanarYUVLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -34,8 +34,8 @@
 namespace reference
 {
 // NormalizePlanarYUV Layer for floating point type
-template <typename T, typename std::enable_if<is_floating_point<T>::value, int>::type *>
-SimpleTensor<T> normalize_planar_yuv_layer(const SimpleTensor<T> &src, const SimpleTensor<T> &mean, const SimpleTensor<T> &sd)
+template <typename T>
+SimpleTensor<T> normalize_planar_yuv_layer(const SimpleTensor<T> &src, const SimpleTensor<T> &mean, const SimpleTensor<T> &std)
 {
     SimpleTensor<T> result(src.shape(), src.data_type());
 
@@ -53,7 +53,7 @@
                 for(int l = 0; l < cols; ++l)
                 {
                     const int pos = l + k * cols + i * rows * cols + r * cols * rows * depth;
-                    result[pos]   = (src[pos] - mean[i]) / sd[i];
+                    result[pos]   = (src[pos] - mean[i]) / std[i];
                 }
             }
         }
@@ -61,8 +61,19 @@
     return result;
 }
 
-template SimpleTensor<half> normalize_planar_yuv_layer(const SimpleTensor<half> &src, const SimpleTensor<half> &mean, const SimpleTensor<half> &sd);
+template <>
+SimpleTensor<uint8_t> normalize_planar_yuv_layer<uint8_t>(const SimpleTensor<uint8_t> &src, const SimpleTensor<uint8_t> &mean, const SimpleTensor<uint8_t> &std)
+{
+    SimpleTensor<float>   src_tmp  = convert_from_asymmetric(src);
+    SimpleTensor<float>   mean_tmp = convert_from_asymmetric(mean);
+    SimpleTensor<float>   std_tmp  = convert_from_asymmetric(std);
+    SimpleTensor<float>   dst_tmp  = normalize_planar_yuv_layer<float>(src_tmp, mean_tmp, std_tmp);
+    SimpleTensor<uint8_t> dst      = convert_to_asymmetric(dst_tmp, src.quantization_info());
+    return dst;
+}
 
+template SimpleTensor<half> normalize_planar_yuv_layer(const SimpleTensor<half> &src, const SimpleTensor<half> &mean, const SimpleTensor<half> &std);
+template SimpleTensor<float> normalize_planar_yuv_layer(const SimpleTensor<float> &src, const SimpleTensor<float> &mean, const SimpleTensor<float> &std);
 } // namespace reference
 } // namespace validation
 } // namespace test
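The float reference normalizes every value with the mean and standard deviation of its channel, and the new QASYMM8 overload simply dequantizes, reuses that float path, and requantizes. A sketch of the per-channel normalization for a planar [C][H*W] layout (layout and names are assumptions for the example):

#include <vector>

// (x - mean[c]) / std[c] for every element of channel c, src laid out as [C][H*W].
std::vector<float> normalize_planar_yuv(const std::vector<float> &src, const std::vector<float> &mean,
                                        const std::vector<float> &stddev, int channels, int hw)
{
    std::vector<float> dst(src.size());
    for(int c = 0; c < channels; ++c)
    {
        for(int i = 0; i < hw; ++i)
        {
            dst[c * hw + i] = (src[c * hw + i] - mean[c]) / stddev[c];
        }
    }
    return dst;
}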
diff --git a/tests/validation/reference/NormalizePlanarYUVLayer.h b/tests/validation/reference/NormalizePlanarYUVLayer.h
index c8740a3..95488f9 100644
--- a/tests/validation/reference/NormalizePlanarYUVLayer.h
+++ b/tests/validation/reference/NormalizePlanarYUVLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -35,8 +35,8 @@
 {
 namespace reference
 {
-template <typename T, typename std::enable_if<is_floating_point<T>::value, int>::type * = nullptr>
-SimpleTensor<T> normalize_planar_yuv_layer(const SimpleTensor<T> &src, const SimpleTensor<T> &mean, const SimpleTensor<T> &sd);
+template <typename T>
+SimpleTensor<T> normalize_planar_yuv_layer(const SimpleTensor<T> &src, const SimpleTensor<T> &mean, const SimpleTensor<T> &std);
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/PadLayer.cpp b/tests/validation/reference/PadLayer.cpp
new file mode 100644
index 0000000..0a3b38d
--- /dev/null
+++ b/tests/validation/reference/PadLayer.cpp
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "PadLayer.h"
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "tests/validation/Helpers.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+template <typename T>
+SimpleTensor<T> pad_layer(const SimpleTensor<T> &src, const PaddingList &paddings)
+{
+    DataType dst_data_type = src.data_type();
+
+    TensorShape orig_shape = src.shape();
+
+    std::vector<PaddingInfo> paddings_extended = paddings;
+
+    for(size_t i = paddings.size(); i < TensorShape::num_max_dimensions; i++)
+    {
+        paddings_extended.emplace_back(PaddingInfo{ 0, 0 });
+    }
+
+    TensorShape padded_shape = misc::shape_calculator::compute_padded_shape(orig_shape, paddings);
+
+    SimpleTensor<T> dst(padded_shape, dst_data_type);
+
+    // Reference algorithm: loop over the different dimension of the input.
+    for(int idx = 0; idx < dst.num_elements(); idx++)
+    {
+        Coordinates coord = index2coord(padded_shape, idx);
+
+        const size_t i = coord.x();
+        const size_t j = coord.y();
+        const size_t k = coord.z();
+        const size_t l = coord[3];
+        const size_t m = coord[4];
+        const size_t n = coord[5];
+
+        std::array<size_t, TensorShape::num_max_dimensions> dims   = { 0, 1, 2, 3, 4, 5 };
+        std::array<size_t, TensorShape::num_max_dimensions> coords = { i, j, k, l, m, n };
+        auto is_padding_area = [&](size_t i)
+        {
+            return (coords[i] < paddings_extended[i].first || coords[i] > orig_shape[i] + paddings_extended[i].first - 1);
+        };
+
+        // If the tuple [i,j,k,l,m,n] is in the padding area, then simply set the value to zero
+        if(std::any_of(dims.begin(), dims.end(), is_padding_area))
+        {
+            dst[idx] = T(0);
+        }
+        else
+        {
+            // If the tuple [i,j,k,l,m,n] is not in the padding area, then copy the input into the output
+
+            Coordinates orig_coords{ i - paddings_extended[0].first,
+                                     j - paddings_extended[1].first,
+                                     k - paddings_extended[2].first,
+                                     l - paddings_extended[3].first,
+                                     m - paddings_extended[4].first,
+                                     n - paddings_extended[5].first };
+
+            const size_t idx_src = coord2index(orig_shape, orig_coords);
+            dst[idx]             = src[idx_src];
+        }
+    }
+
+    return dst;
+}
+
+template SimpleTensor<float> pad_layer(const SimpleTensor<float> &src, const PaddingList &paddings);
+template SimpleTensor<half> pad_layer(const SimpleTensor<half> &src, const PaddingList &paddings);
+template SimpleTensor<uint32_t> pad_layer(const SimpleTensor<uint32_t> &src, const PaddingList &paddings);
+template SimpleTensor<uint8_t> pad_layer(const SimpleTensor<uint8_t> &src, const PaddingList &paddings);
+template SimpleTensor<int8_t> pad_layer(const SimpleTensor<int8_t> &src, const PaddingList &paddings);
+template SimpleTensor<uint16_t> pad_layer(const SimpleTensor<uint16_t> &src, const PaddingList &paddings);
+template SimpleTensor<int16_t> pad_layer(const SimpleTensor<int16_t> &src, const PaddingList &paddings);
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
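The new pad reference walks every element of the padded output, writes zero in the padding area and copies the corresponding input element otherwise. A tiny 2-D usage sketch of the same rule, assuming the "before" padding matches PaddingInfo::first (standalone, not the library function):

#include <cstddef>
#include <vector>

// Zero-pad a row-major [h x w] matrix with per-side padding amounts.
std::vector<float> pad2d(const std::vector<float> &src, std::size_t w, std::size_t h,
                         std::size_t pad_left, std::size_t pad_right, std::size_t pad_top, std::size_t pad_bottom)
{
    const std::size_t out_w = w + pad_left + pad_right;
    const std::size_t out_h = h + pad_top + pad_bottom;
    std::vector<float> dst(out_w * out_h, 0.0f); // padding area stays zero
    for(std::size_t y = 0; y < h; ++y)
    {
        for(std::size_t x = 0; x < w; ++x)
        {
            dst[(y + pad_top) * out_w + (x + pad_left)] = src[y * w + x];
        }
    }
    return dst;
}
// e.g. a 2x2 input with one element of padding on every side becomes a 4x4
// output whose centre holds the original values, matching pad_layer with
// PaddingList{ {1, 1}, {1, 1} }.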
diff --git a/tests/validation/reference/ArithmeticAddition.h b/tests/validation/reference/PadLayer.h
similarity index 71%
copy from tests/validation/reference/ArithmeticAddition.h
copy to tests/validation/reference/PadLayer.h
index faeabd7..9406b05 100644
--- a/tests/validation/reference/ArithmeticAddition.h
+++ b/tests/validation/reference/PadLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,9 +21,10 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef __ARM_COMPUTE_TEST_ARITHMETIC_ADDITION_H__
-#define __ARM_COMPUTE_TEST_ARITHMETIC_ADDITION_H__
+#ifndef __ARM_COMPUTE_TEST_PADLAYER_H__
+#define __ARM_COMPUTE_TEST_PADLAYER_H__
 
+#include "arm_compute/core/Types.h"
 #include "tests/SimpleTensor.h"
 #include "tests/validation/Helpers.h"
 
@@ -35,14 +36,18 @@
 {
 namespace reference
 {
+/** Reference function to pad an ND tensor. This function is not supposed to be optimized, but to
+ * clearly and naively execute the padding of a tensor
+ *
+ * @param[in] src      Tensor to pad
+ * @param[in] paddings Padding size in each dimension
+ *
+ * @return The padded Tensor
+ */
 template <typename T>
-SimpleTensor<T> arithmetic_addition(const SimpleTensor<T> &src1, const SimpleTensor<T> &src2, SimpleTensor<T> &dst, ConvertPolicy convert_policy);
-
-template <typename T>
-SimpleTensor<T> arithmetic_addition(const SimpleTensor<T> &src1, const SimpleTensor<T> &src2, DataType dst_data_type, ConvertPolicy convert_policy);
-
+SimpleTensor<T> pad_layer(const SimpleTensor<T> &src, const PaddingList &paddings);
 } // namespace reference
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
-#endif /* __ARM_COMPUTE_TEST_ARITHMETIC_ADDITION_H__ */
+#endif /* __ARM_COMPUTE_TEST_PADLAYER_H__ */
diff --git a/tests/validation/reference/PixelWiseMultiplication.cpp b/tests/validation/reference/PixelWiseMultiplication.cpp
index 859da5c..d86f8aa 100644
--- a/tests/validation/reference/PixelWiseMultiplication.cpp
+++ b/tests/validation/reference/PixelWiseMultiplication.cpp
@@ -23,6 +23,8 @@
  */
 #include "PixelWiseMultiplication.h"
 
+#include "tests/validation/Helpers.h"
+
 namespace arm_compute
 {
 namespace test
@@ -126,8 +128,10 @@
 } // namespace
 
 template <typename T1, typename T2>
-SimpleTensor<T2> pixel_wise_multiplication(const SimpleTensor<T1> &src1, const SimpleTensor<T2> &src2, float scale, ConvertPolicy convert_policy, RoundingPolicy rounding_policy)
+SimpleTensor<T2> pixel_wise_multiplication(const SimpleTensor<T1> &src1, const SimpleTensor<T2> &src2, float scale, ConvertPolicy convert_policy, RoundingPolicy rounding_policy, QuantizationInfo qout)
 {
+    ARM_COMPUTE_UNUSED(qout);
+
     SimpleTensor<T2> dst(TensorShape::broadcast_shape(src1.shape(), src2.shape()), src2.data_type());
 
     if(scale < 0)
@@ -142,13 +146,37 @@
     return dst;
 }
 
+template <>
+SimpleTensor<uint8_t> pixel_wise_multiplication(const SimpleTensor<uint8_t> &src1, const SimpleTensor<uint8_t> &src2, float scale, ConvertPolicy convert_policy, RoundingPolicy rounding_policy,
+                                                QuantizationInfo qout)
+{
+    SimpleTensor<uint8_t> dst(TensorShape::broadcast_shape(src1.shape(), src2.shape()), src2.data_type(), 1, qout);
+
+    if(src1.data_type() == DataType::QASYMM8 && src2.data_type() == DataType::QASYMM8)
+    {
+        SimpleTensor<float> src1_tmp = convert_from_asymmetric(src1);
+        SimpleTensor<float> src2_tmp = convert_from_asymmetric(src2);
+        SimpleTensor<float> dst_tmp  = pixel_wise_multiplication<float>(src1_tmp, src2_tmp, scale, convert_policy, rounding_policy, qout);
+        dst                          = convert_to_asymmetric(dst_tmp, qout);
+    }
+    else
+    {
+        if(scale < 0)
+        {
+            ARM_COMPUTE_ERROR("Scale of pixel-wise multiplication must be non-negative");
+        }
+
+        Coordinates id_src1, id_src2, id_dst;
+        BroadcastUnroll<Coordinates::num_max_dimensions>::unroll(src1, src2, dst, scale, convert_policy, rounding_policy, id_src1, id_src2, id_dst);
+    }
+    return dst;
+}
 // *INDENT-OFF*
 // clang-format off
-template SimpleTensor<uint8_t> pixel_wise_multiplication(const SimpleTensor<uint8_t> &src1, const SimpleTensor<uint8_t> &src2, float scale, ConvertPolicy convert_policy, RoundingPolicy rounding_policy);
-template SimpleTensor<int16_t> pixel_wise_multiplication(const SimpleTensor<uint8_t> &src1, const SimpleTensor<int16_t> &src2, float scale, ConvertPolicy convert_policy, RoundingPolicy rounding_policy);
-template SimpleTensor<int16_t> pixel_wise_multiplication(const SimpleTensor<int16_t> &src1, const SimpleTensor<int16_t> &src2, float scale, ConvertPolicy convert_policy, RoundingPolicy rounding_policy);
-template SimpleTensor<float> pixel_wise_multiplication(const SimpleTensor<float> &src1, const SimpleTensor<float> &src2, float scale, ConvertPolicy convert_policy, RoundingPolicy rounding_policy);
-template SimpleTensor<half_float::half> pixel_wise_multiplication(const SimpleTensor<half_float::half> &src1, const SimpleTensor<half_float::half> &src2, float scale, ConvertPolicy convert_policy, RoundingPolicy rounding_policy);
+template SimpleTensor<int16_t> pixel_wise_multiplication(const SimpleTensor<uint8_t> &src1, const SimpleTensor<int16_t> &src2, float scale, ConvertPolicy convert_policy, RoundingPolicy rounding_policy, QuantizationInfo qout);
+template SimpleTensor<int16_t> pixel_wise_multiplication(const SimpleTensor<int16_t> &src1, const SimpleTensor<int16_t> &src2, float scale, ConvertPolicy convert_policy, RoundingPolicy rounding_policy, QuantizationInfo qout);
+template SimpleTensor<float> pixel_wise_multiplication(const SimpleTensor<float> &src1, const SimpleTensor<float> &src2, float scale, ConvertPolicy convert_policy, RoundingPolicy rounding_policy, QuantizationInfo qout);
+template SimpleTensor<half_float::half> pixel_wise_multiplication(const SimpleTensor<half_float::half> &src1, const SimpleTensor<half_float::half> &src2, float scale, ConvertPolicy convert_policy, RoundingPolicy rounding_policy, QuantizationInfo qout);
 // clang-format on
 // *INDENT-ON*
 } // namespace reference
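The QASYMM8 overload added above follows the usual pattern for quantized reference code: dequantize both inputs with their QuantizationInfo, run the float reference, then requantize with the output QuantizationInfo. A standalone single-element sketch of that round-trip (the helpers below stand in for convert_from_asymmetric / convert_to_asymmetric and are not the library functions):

#include <algorithm>
#include <cmath>
#include <cstdint>

struct QInfo { float scale; int32_t offset; };

float   dequantize(uint8_t q, QInfo qi) { return (static_cast<int32_t>(q) - qi.offset) * qi.scale; }
uint8_t quantize(float v, QInfo qi)
{
    const int32_t q = static_cast<int32_t>(std::lround(v / qi.scale)) + qi.offset;
    return static_cast<uint8_t>(std::min<int32_t>(255, std::max<int32_t>(0, q)));
}

// Quantized pixel-wise multiplication of one element: dequantize, multiply in
// float (including the extra scale factor), requantize with the output QuantizationInfo.
uint8_t mul_qasymm8(uint8_t a, QInfo qa, uint8_t b, QInfo qb, float scale, QInfo qout)
{
    return quantize(dequantize(a, qa) * dequantize(b, qb) * scale, qout);
}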
diff --git a/tests/validation/reference/PixelWiseMultiplication.h b/tests/validation/reference/PixelWiseMultiplication.h
index 1dce154..787a7b2 100644
--- a/tests/validation/reference/PixelWiseMultiplication.h
+++ b/tests/validation/reference/PixelWiseMultiplication.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -35,7 +35,8 @@
 namespace reference
 {
 template <typename T1, typename T2>
-SimpleTensor<T2> pixel_wise_multiplication(const SimpleTensor<T1> &src1, const SimpleTensor<T2> &src2, float scale, ConvertPolicy convert_policy, RoundingPolicy rounding_policy);
+SimpleTensor<T2> pixel_wise_multiplication(const SimpleTensor<T1> &src1, const SimpleTensor<T2> &src2, float scale,
+                                           ConvertPolicy convert_policy, RoundingPolicy rounding_policy, QuantizationInfo qout = QuantizationInfo());
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/PoolingLayer.cpp b/tests/validation/reference/PoolingLayer.cpp
index 02c430a..e617c93 100644
--- a/tests/validation/reference/PoolingLayer.cpp
+++ b/tests/validation/reference/PoolingLayer.cpp
@@ -37,7 +37,7 @@
 {
 using namespace arm_compute::misc::shape_calculator;
 
-template <typename T, typename std::enable_if<is_floating_point<T>::value, int>::type>
+template <typename T>
 SimpleTensor<T> pooling_layer(const SimpleTensor<T> &src, const PoolingLayerInfo &info)
 {
     ARM_COMPUTE_ERROR_ON(info.is_global_pooling() && (src.shape().x() != src.shape().y()));
diff --git a/tests/validation/reference/PoolingLayer.h b/tests/validation/reference/PoolingLayer.h
index b0d30af..0097789 100644
--- a/tests/validation/reference/PoolingLayer.h
+++ b/tests/validation/reference/PoolingLayer.h
@@ -35,10 +35,7 @@
 {
 namespace reference
 {
-template <typename T, typename std::enable_if<is_floating_point<T>::value, int>::type = 0>
-SimpleTensor<T> pooling_layer(const SimpleTensor<T> &src, const PoolingLayerInfo &info);
-
-template <typename T, typename std::enable_if<std::is_integral<T>::value, int>::type = 0>
+template <typename T>
 SimpleTensor<T> pooling_layer(const SimpleTensor<T> &src, const PoolingLayerInfo &info);
 } // namespace reference
 } // namespace validation
diff --git a/tests/validation/reference/PriorBoxLayer.cpp b/tests/validation/reference/PriorBoxLayer.cpp
new file mode 100644
index 0000000..0fd4a8a
--- /dev/null
+++ b/tests/validation/reference/PriorBoxLayer.cpp
@@ -0,0 +1,158 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "PriorBoxLayer.h"
+
+#include "ActivationLayer.h"
+
+#include "tests/validation/Helpers.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+template <typename T>
+SimpleTensor<T> prior_box_layer(const SimpleTensor<T> &src1, const SimpleTensor<T> &src2, const PriorBoxLayerInfo &info, const TensorShape &output_shape)
+{
+    const auto layer_width  = static_cast<int>(src1.shape()[0]);
+    const auto layer_height = static_cast<int>(src1.shape()[1]);
+
+    int img_width  = info.img_size().x;
+    int img_height = info.img_size().y;
+    if(img_width == 0 || img_height == 0)
+    {
+        img_width  = static_cast<int>(src2.shape()[0]);
+        img_height = static_cast<int>(src2.shape()[1]);
+    }
+
+    float step_x = info.steps()[0];
+    float step_y = info.steps()[1];
+    if(step_x == 0.f || step_y == 0.f)
+    {
+        step_x = static_cast<float>(img_width) / layer_width;
+        step_y = static_cast<float>(img_height) / layer_height;
+    }
+
+    // Calculate number of aspect ratios
+    const int num_priors     = info.aspect_ratios().size() * info.min_sizes().size() + info.max_sizes().size();
+    const int total_elements = layer_width * layer_height * num_priors * 4;
+
+    SimpleTensor<T> result(output_shape, src1.data_type());
+
+    int idx = 0;
+    for(int y = 0; y < layer_height; ++y)
+    {
+        for(int x = 0; x < layer_width; ++x)
+        {
+            const float center_x = (x + info.offset()) * step_x;
+            const float center_y = (y + info.offset()) * step_y;
+            float       box_width;
+            float       box_height;
+            for(unsigned int i = 0; i < info.min_sizes().size(); ++i)
+            {
+                const float min_size = info.min_sizes().at(i);
+                box_width            = min_size;
+                box_height           = min_size;
+                // (xmin, ymin, xmax, ymax)
+                result[idx++] = (center_x - box_width / 2.f) / img_width;
+                result[idx++] = (center_y - box_height / 2.f) / img_height;
+                result[idx++] = (center_x + box_width / 2.f) / img_width;
+                result[idx++] = (center_y + box_height / 2.f) / img_height;
+
+                if(!info.max_sizes().empty())
+                {
+                    const float max_size = info.max_sizes().at(i);
+                    box_width            = sqrt(min_size * max_size);
+                    box_height           = box_width;
+
+                    // (xmin, ymin, xmax, ymax)
+                    result[idx++] = (center_x - box_width / 2.f) / img_width;
+                    result[idx++] = (center_y - box_height / 2.f) / img_height;
+                    result[idx++] = (center_x + box_width / 2.f) / img_width;
+                    result[idx++] = (center_y + box_height / 2.f) / img_height;
+                }
+
+                // rest of priors
+                for(auto ar : info.aspect_ratios())
+                {
+                    if(fabs(ar - 1.) < 1e-6)
+                    {
+                        continue;
+                    }
+
+                    box_width  = min_size * sqrt(ar);
+                    box_height = min_size / sqrt(ar);
+
+                    // (xmin, ymin, xmax, ymax)
+                    result[idx++] = (center_x - box_width / 2.f) / img_width;
+                    result[idx++] = (center_y - box_height / 2.f) / img_height;
+                    result[idx++] = (center_x + box_width / 2.f) / img_width;
+                    result[idx++] = (center_y + box_height / 2.f) / img_height;
+                }
+            }
+        }
+    }
+
+    // clip the coordinates
+    if(info.clip())
+    {
+        for(int i = 0; i < total_elements; ++i)
+        {
+            result[i] = std::min<T>(std::max<T>(result[i], 0.f), 1.f);
+        }
+    }
+
+    // set the variance.
+    if(info.variances().size() == 1)
+    {
+        std::fill_n(result.data() + idx, total_elements, info.variances().at(0));
+    }
+    else
+    {
+        for(int h = 0; h < layer_height; ++h)
+        {
+            for(int w = 0; w < layer_width; ++w)
+            {
+                for(int i = 0; i < num_priors; ++i)
+                {
+                    for(int j = 0; j < 4; ++j)
+                    {
+                        result[idx++] = info.variances().at(j);
+                    }
+                }
+            }
+        }
+    }
+
+    return result;
+}
+template SimpleTensor<float> prior_box_layer(const SimpleTensor<float> &src1, const SimpleTensor<float> &src2, const PriorBoxLayerInfo &info, const TensorShape &output_shape);
+
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
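Each feature-map cell emits num_priors boxes: one per min_size, one extra per max_size (with side sqrt(min * max)), and one per non-unit aspect ratio with sides min*sqrt(ar) and min/sqrt(ar); coordinates are normalized by the image size and the second half of the output stores the variances. A standalone sketch of the prior count and one aspect-ratio box (names and struct are illustrative):

#include <cmath>
#include <cstddef>

struct Box { float xmin, ymin, xmax, ymax; };

// One prior for a cell centred at (center_x, center_y), for a given min_size and aspect ratio.
Box prior_for_aspect_ratio(float center_x, float center_y, float min_size, float ar,
                           float img_w, float img_h)
{
    const float box_w = min_size * std::sqrt(ar);
    const float box_h = min_size / std::sqrt(ar);
    return Box{ (center_x - box_w / 2.f) / img_w,
                (center_y - box_h / 2.f) / img_h,
                (center_x + box_w / 2.f) / img_w,
                (center_y + box_h / 2.f) / img_h };
}

// Number of priors per cell, mirroring the formula used in the reference:
// aspect_ratios * min_sizes + max_sizes.
std::size_t num_priors(std::size_t n_aspect_ratios, std::size_t n_min_sizes, std::size_t n_max_sizes)
{
    return n_aspect_ratios * n_min_sizes + n_max_sizes;
}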
diff --git a/tests/validation/reference/ArithmeticSubtraction.h b/tests/validation/reference/PriorBoxLayer.h
similarity index 76%
rename from tests/validation/reference/ArithmeticSubtraction.h
rename to tests/validation/reference/PriorBoxLayer.h
index 9308314..25e567f 100644
--- a/tests/validation/reference/ArithmeticSubtraction.h
+++ b/tests/validation/reference/PriorBoxLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef __ARM_COMPUTE_TEST_ARITHMETIC_SUBTRACTION_H__
-#define __ARM_COMPUTE_TEST_ARITHMETIC_SUBTRACTION_H__
+#ifndef __ARM_COMPUTE_TEST_PRIOR_BOX_LAYER_H__
+#define __ARM_COMPUTE_TEST_PRIOR_BOX_LAYER_H__
 
 #include "tests/SimpleTensor.h"
 #include "tests/validation/Helpers.h"
@@ -35,10 +35,10 @@
 {
 namespace reference
 {
-template <typename T1, typename T2, typename T3>
-SimpleTensor<T3> arithmetic_subtraction(const SimpleTensor<T1> &src1, const SimpleTensor<T2> &src2, DataType dst_data_type, ConvertPolicy convert_policy);
+template <typename T>
+SimpleTensor<T> prior_box_layer(const SimpleTensor<T> &src1, const SimpleTensor<T> &src2, const PriorBoxLayerInfo &info, const TensorShape &output_shape);
 } // namespace reference
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
-#endif /* __ARM_COMPUTE_TEST_ARITHMETIC_SUBTRACTION_H__ */
+#endif /* __ARM_COMPUTE_TEST_PRIOR_BOX_LAYER_H__ */
diff --git a/tests/validation/reference/ROIAlignLayer.cpp b/tests/validation/reference/ROIAlignLayer.cpp
new file mode 100644
index 0000000..8a76983
--- /dev/null
+++ b/tests/validation/reference/ROIAlignLayer.cpp
@@ -0,0 +1,191 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "ROIAlignLayer.h"
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "tests/validation/Helpers.h"
+
+#include <algorithm>
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+namespace
+{
+/** Average pooling over an aligned window */
+template <typename T>
+inline T roi_align_1x1(const T *input, TensorShape input_shape,
+                       float region_start_x,
+                       float bin_size_x,
+                       int   grid_size_x,
+                       float region_end_x,
+                       float region_start_y,
+                       float bin_size_y,
+                       int   grid_size_y,
+                       float region_end_y,
+                       int   pz)
+{
+    if((region_end_x <= region_start_x) || (region_end_y <= region_start_y))
+    {
+        return T(0);
+    }
+    else
+    {
+        float avg = 0;
+        // Iterate through the aligned pooling region
+        for(int iy = 0; iy < grid_size_y; ++iy)
+        {
+            for(int ix = 0; ix < grid_size_x; ++ix)
+            {
+                // Align the window in the middle of every bin
+                float y = region_start_y + (iy + 0.5) * bin_size_y / float(grid_size_y);
+                float x = region_start_x + (ix + 0.5) * bin_size_x / float(grid_size_x);
+
+                // Interpolation in the [0,0] [0,1] [1,0] [1,1] square
+                const int y_low  = y;
+                const int x_low  = x;
+                const int y_high = y_low + 1;
+                const int x_high = x_low + 1;
+
+                const float ly = y - y_low;
+                const float lx = x - x_low;
+                const float hy = 1. - ly;
+                const float hx = 1. - lx;
+
+                const float w1 = hy * hx;
+                const float w2 = hy * lx;
+                const float w3 = ly * hx;
+                const float w4 = ly * lx;
+
+                const size_t idx1  = coord2index(input_shape, Coordinates(x_low, y_low, pz));
+                T            data1 = input[idx1];
+
+                const size_t idx2  = coord2index(input_shape, Coordinates(x_high, y_low, pz));
+                T            data2 = input[idx2];
+
+                const size_t idx3  = coord2index(input_shape, Coordinates(x_low, y_high, pz));
+                T            data3 = input[idx3];
+
+                const size_t idx4  = coord2index(input_shape, Coordinates(x_high, y_high, pz));
+                T            data4 = input[idx4];
+
+                avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4;
+            }
+        }
+
+        avg /= grid_size_x * grid_size_y;
+
+        return T(avg);
+    }
+}
+
+/** Clamp the value between lower and upper */
+template <typename T>
+T clamp(T value, T lower, T upper)
+{
+    return std::max(lower, std::min(value, upper));
+}
+} // namespace
+template <typename T>
+SimpleTensor<T> roi_align_layer(const SimpleTensor<T> &src, const SimpleTensor<T> &rois, const ROIPoolingLayerInfo &pool_info)
+{
+    const size_t values_per_roi = rois.shape()[0];
+    const size_t num_rois       = rois.shape()[1];
+    DataType     dst_data_type  = src.data_type();
+
+    const auto *rois_ptr = static_cast<const T *>(rois.data());
+
+    TensorShape     input_shape = src.shape();
+    TensorShape     output_shape(pool_info.pooled_width(), pool_info.pooled_height(), src.shape()[2], num_rois);
+    SimpleTensor<T> dst(output_shape, dst_data_type);
+
+    // Iterate over every pixel of the input image
+    for(size_t px = 0; px < pool_info.pooled_width(); ++px)
+    {
+        for(size_t py = 0; py < pool_info.pooled_height(); ++py)
+        {
+            for(size_t pw = 0; pw < num_rois; ++pw)
+            {
+                const unsigned int roi_batch = rois_ptr[values_per_roi * pw];
+                const auto         x1        = float(rois_ptr[values_per_roi * pw + 1]);
+                const auto         y1        = float(rois_ptr[values_per_roi * pw + 2]);
+                const auto         x2        = float(rois_ptr[values_per_roi * pw + 3]);
+                const auto         y2        = float(rois_ptr[values_per_roi * pw + 4]);
+
+                const float roi_anchor_x = x1 * pool_info.spatial_scale();
+                const float roi_anchor_y = y1 * pool_info.spatial_scale();
+                const float roi_dims_x   = std::max((x2 - x1) * pool_info.spatial_scale(), 1.0f);
+                const float roi_dims_y   = std::max((y2 - y1) * pool_info.spatial_scale(), 1.0f);
+
+                float bin_size_x     = roi_dims_x / pool_info.pooled_width();
+                float bin_size_y     = roi_dims_y / pool_info.pooled_height();
+                float region_start_x = px * bin_size_x + roi_anchor_x;
+                float region_start_y = py * bin_size_y + roi_anchor_y;
+                float region_end_x   = (px + 1) * bin_size_x + roi_anchor_x;
+                float region_end_y   = (py + 1) * bin_size_y + roi_anchor_y;
+
+                region_start_x = clamp(region_start_x, 0.0f, float(input_shape[0]));
+                region_start_y = clamp(region_start_y, 0.0f, float(input_shape[1]));
+                region_end_x   = clamp(region_end_x, 0.0f, float(input_shape[0]));
+                region_end_y   = clamp(region_end_y, 0.0f, float(input_shape[1]));
+
+                const int roi_bin_grid_x = (pool_info.sampling_ratio() > 0) ? pool_info.sampling_ratio() : int(ceil(bin_size_x));
+                const int roi_bin_grid_y = (pool_info.sampling_ratio() > 0) ? pool_info.sampling_ratio() : int(ceil(bin_size_y));
+
+                // Move input and output pointer across the fourth dimension
+                const size_t input_stride_w  = input_shape[0] * input_shape[1] * input_shape[2];
+                const size_t output_stride_w = output_shape[0] * output_shape[1] * output_shape[2];
+                const T     *input_ptr       = src.data() + roi_batch * input_stride_w;
+                T           *output_ptr      = dst.data() + px + py * output_shape[0] + pw * output_stride_w;
+
+                for(int pz = 0; pz < int(input_shape[2]); ++pz)
+                {
+                    // For every pixel pool over an aligned region
+                    *(output_ptr + pz * output_shape[0] * output_shape[1]) = roi_align_1x1(input_ptr, input_shape,
+                                                                                           region_start_x,
+                                                                                           bin_size_x,
+                                                                                           roi_bin_grid_x,
+                                                                                           region_end_x,
+                                                                                           region_start_y,
+                                                                                           bin_size_y,
+                                                                                           roi_bin_grid_y,
+                                                                                           region_end_y, pz);
+                }
+            }
+        }
+    }
+    return dst;
+}
+template SimpleTensor<float> roi_align_layer(const SimpleTensor<float> &src, const SimpleTensor<float> &rois, const ROIPoolingLayerInfo &pool_info);
+template SimpleTensor<half> roi_align_layer(const SimpleTensor<half> &src, const SimpleTensor<half> &rois, const ROIPoolingLayerInfo &pool_info);
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
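Inside each pooled bin, roi_align_1x1 reads every sampling point with bilinear interpolation from the four surrounding pixels and averages the samples. A standalone sketch of that bilinear read at one fractional (x, y) position (bounds handling is omitted for brevity; illustration only):

#include <vector>

// Bilinear sample of a row-major [h x w] single-channel image at a fractional (x, y).
float bilinear_at(const std::vector<float> &img, int w, float x, float y)
{
    const int   x_low  = static_cast<int>(x);
    const int   y_low  = static_cast<int>(y);
    const int   x_high = x_low + 1;
    const int   y_high = y_low + 1;
    const float lx     = x - x_low;
    const float ly     = y - y_low;
    const float hx     = 1.f - lx;
    const float hy     = 1.f - ly;

    return hy * hx * img[y_low * w + x_low] + hy * lx * img[y_low * w + x_high] +
           ly * hx * img[y_high * w + x_low] + ly * lx * img[y_high * w + x_high];
}
// roi_align_1x1 evaluates this at grid_size_x * grid_size_y points placed at
// the centre of every sub-bin and returns their average.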
diff --git a/tests/validation/reference/ArithmeticSubtraction.h b/tests/validation/reference/ROIAlignLayer.h
similarity index 76%
copy from tests/validation/reference/ArithmeticSubtraction.h
copy to tests/validation/reference/ROIAlignLayer.h
index 9308314..b67ff42 100644
--- a/tests/validation/reference/ArithmeticSubtraction.h
+++ b/tests/validation/reference/ROIAlignLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,9 +21,10 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef __ARM_COMPUTE_TEST_ARITHMETIC_SUBTRACTION_H__
-#define __ARM_COMPUTE_TEST_ARITHMETIC_SUBTRACTION_H__
+#ifndef __ARM_COMPUTE_TEST_ROIALIGNLAYER_H__
+#define __ARM_COMPUTE_TEST_ROIALIGNLAYER_H__
 
+#include "arm_compute/core/Types.h"
 #include "tests/SimpleTensor.h"
 #include "tests/validation/Helpers.h"
 
@@ -35,10 +36,10 @@
 {
 namespace reference
 {
-template <typename T1, typename T2, typename T3>
-SimpleTensor<T3> arithmetic_subtraction(const SimpleTensor<T1> &src1, const SimpleTensor<T2> &src2, DataType dst_data_type, ConvertPolicy convert_policy);
+template <typename T>
+SimpleTensor<T> roi_align_layer(const SimpleTensor<T> &src, const SimpleTensor<T> &rois, const ROIPoolingLayerInfo &pool_info);
 } // namespace reference
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
-#endif /* __ARM_COMPUTE_TEST_ARITHMETIC_SUBTRACTION_H__ */
+#endif /* __ARM_COMPUTE_TEST_ROIALIGNLAYER_H__ */
diff --git a/tests/validation/reference/ReductionOperation.cpp b/tests/validation/reference/ReductionOperation.cpp
index 871a761..2f103a6 100644
--- a/tests/validation/reference/ReductionOperation.cpp
+++ b/tests/validation/reference/ReductionOperation.cpp
@@ -39,24 +39,39 @@
 namespace
 {
 template <typename T>
-struct square
+T reduce_operation(T *ptr, int reduce_elements, ReductionOperation op, int stride)
 {
-    T operator()(const T &lhs, const T &rhs) const
-    {
-        return (lhs + rhs * rhs);
-    }
-};
+    using type = typename std::remove_cv<T>::type;
+    auto res   = type(0);
 
-template <typename T>
-T reduce_operation(T *ptr, int reduce_elements, ReductionOperation op)
-{
-    switch(op)
+    if(std::is_integral<type>::value)
     {
-        case ReductionOperation::SUM_SQUARE:
-            return std::accumulate(ptr, ptr + reduce_elements, static_cast<T>(0), square<T>());
-        default:
-            ARM_COMPUTE_ERROR("Unsupported reduction operation");
+        uint32_t int_res = 0;
+        for(int i = 0; i < reduce_elements; ++i)
+        {
+            auto elem = static_cast<uint32_t>(*(ptr + stride * i));
+            int_res += (op == ReductionOperation::SUM_SQUARE) ? elem * elem : elem;
+        }
+        if(op == ReductionOperation::MEAN_SUM && reduce_elements > 0)
+        {
+            int_res /= reduce_elements;
+        }
+        res = saturate_cast<type>(int_res);
     }
+    else
+    {
+        for(int i = 0; i < reduce_elements; ++i)
+        {
+            auto elem = *(ptr + stride * i);
+            res += (op == ReductionOperation::SUM_SQUARE) ? elem * elem : elem;
+        }
+        if(op == ReductionOperation::MEAN_SUM && reduce_elements > 0)
+        {
+            res /= reduce_elements;
+        }
+    }
+
+    return res;
 }
 } // namespace
 
@@ -64,23 +79,85 @@
 SimpleTensor<T> reduction_operation(const SimpleTensor<T> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op)
 {
     // Create reference
-    SimpleTensor<T> dst{ dst_shape, src.data_type() };
+    SimpleTensor<T>    dst{ dst_shape, src.data_type(), 1, src.quantization_info() };
+    const unsigned int src_width    = src.shape().x();
+    const unsigned int src_height   = src.shape().y();
+    const unsigned int src_depth    = src.shape().z();
+    const unsigned int src_batch    = src.shape()[3];
+    const int          reduce_elems = src.shape()[axis];
 
-    // Compute reference
-    const int reduce_elems = src.shape()[axis];
-    const int upper_dims   = src.shape().total_size_upper(axis + 1);
-
-    for(int du = 0; du < upper_dims; ++du)
+    switch(axis)
     {
-        if(axis == 0)
+        case 0:
         {
-            const T *src_row_ptr = src.data() + du * reduce_elems;
-            dst[du]              = reduce_operation(src_row_ptr, reduce_elems, op);
+            const unsigned int upper_dims = src.shape().total_size_upper(1);
+            for(unsigned int du = 0; du < upper_dims; ++du)
+            {
+                const T *src_row_ptr = src.data() + du * reduce_elems;
+                auto     res         = reduce_operation(src_row_ptr, reduce_elems, op, 1);
+                dst[du]              = res;
+            }
         }
-        else
+        break;
+        case 1:
         {
+            const unsigned int upper_dims = src.shape().total_size_upper(2);
+            for(unsigned int du = 0; du < upper_dims; ++du)
+            {
+                for(unsigned int x = 0; x < src_width; ++x)
+                {
+                    const int in_offset   = du * src_height * src_width + x;
+                    const int out_offset  = du * src_width + x;
+                    const T *src_row_ptr = src.data() + in_offset;
+                    auto      res         = reduce_operation(src_row_ptr, reduce_elems, op, src_width);
+                    dst[out_offset]       = res;
+                }
+            }
+        }
+        break;
+        case 2:
+        {
+            const unsigned int upper_dims = src.shape().total_size_upper(3);
+            for(unsigned int du = 0; du < upper_dims; ++du)
+            {
+                for(unsigned int x = 0; x < src_width; ++x)
+                {
+                    for(unsigned int y = 0; y < src_height; ++y)
+                    {
+                        const int in_offset   = du * src_depth * src_height * src_width + y * src_width + x;
+                        const int out_offset  = du * src_width * src_height + y * src_width + x;
+                        const T *src_row_ptr = src.data() + in_offset;
+                        auto      res         = reduce_operation(src_row_ptr, reduce_elems, op, src_height * src_width);
+                        dst[out_offset]       = res;
+                    }
+                }
+            }
+        }
+        break;
+        case 3:
+        {
+            const unsigned int upper_dims = src.shape().total_size_upper(4);
+            for(unsigned int du = 0; du < upper_dims; ++du)
+            {
+                for(unsigned int z = 0; z < src_depth; ++z)
+                {
+                    for(unsigned int y = 0; y < src_height; ++y)
+                    {
+                        for(unsigned int x = 0; x < src_width; ++x)
+                        {
+                            const int in_offset   = du * src_batch * src_depth * src_height * src_width + z * src_width * src_height + y * src_width + x;
+                            const int out_offset  = du * src_depth * src_height * src_width + z * src_width * src_height + y * src_width + x;
+                            const T *src_row_ptr = src.data() + in_offset;
+                            auto      res         = reduce_operation(src_row_ptr, reduce_elems, op, src_width * src_height * src_depth);
+                            dst[out_offset]       = res;
+                        }
+                    }
+                }
+            }
+        }
+        break;
+        default:
             ARM_COMPUTE_ERROR("Unsupported reduction axis");
-        }
     }
 
     return dst;
@@ -88,6 +165,7 @@
 
 template SimpleTensor<float> reduction_operation(const SimpleTensor<float> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op);
 template SimpleTensor<half> reduction_operation(const SimpleTensor<half> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op);
+template SimpleTensor<uint8_t> reduction_operation(const SimpleTensor<uint8_t> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op);
 } // namespace reference
 } // namespace validation
 } // namespace test
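
The reworked reduce_operation only needs a start pointer and a stride to reduce along any axis: axis 0 uses stride 1, axis 1 the row width, axis 2 the slice size, and so on, which is exactly what the per-axis cases above set up. A standalone sketch of that strided accumulation, with a small enum standing in for ReductionOperation (names here are illustrative); the reference additionally accumulates integral types in 32 bits and saturates the result:

#include <cstddef>

enum class Reduce
{
    SUM,
    SUM_SQUARE,
    MEAN_SUM
};

// Reduce 'count' float values spaced 'stride' elements apart, starting at 'ptr'.
// The same access pattern handles every axis; only the starting offset and the
// stride passed by the caller change.
float reduce_strided(const float *ptr, int count, std::ptrdiff_t stride, Reduce op)
{
    float res = 0.0f;
    for(int i = 0; i < count; ++i)
    {
        const float elem = ptr[i * stride];
        res += (op == Reduce::SUM_SQUARE) ? elem * elem : elem;
    }
    if(op == Reduce::MEAN_SUM && count > 0)
    {
        res /= count;
    }
    return res;
}
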
diff --git a/tests/validation/reference/ReductionOperation.h b/tests/validation/reference/ReductionOperation.h
index 6da6436..859b57a 100644
--- a/tests/validation/reference/ReductionOperation.h
+++ b/tests/validation/reference/ReductionOperation.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/ReorgLayer.cpp b/tests/validation/reference/ReorgLayer.cpp
new file mode 100644
index 0000000..2eb5d01
--- /dev/null
+++ b/tests/validation/reference/ReorgLayer.cpp
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "ReorgLayer.h"
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+template <typename T>
+SimpleTensor<T> reorg_layer(const SimpleTensor<T> &src, int32_t stride)
+{
+    ARM_COMPUTE_ERROR_ON(src.shape().num_dimensions() > 4);
+    ARM_COMPUTE_ERROR_ON(src.data_layout() != DataLayout::NCHW);
+
+    TensorInfo        input_info(src.shape(), 1, src.data_type());
+    const TensorShape output_shape = misc::shape_calculator::compute_reorg_output_shape(input_info, stride);
+
+    // Create destination tensor
+    SimpleTensor<T> dst{ output_shape, src.data_type() };
+
+    const unsigned int W          = dst.shape().x();
+    const unsigned int H          = dst.shape().y();
+    const unsigned int C          = dst.shape().z();
+    const unsigned int out_c      = C / (stride * stride);
+    const unsigned int outer_dims = dst.shape().total_size() / (W * H * C);
+
+    // Calculate layer reorg in NCHW
+    Coordinates map_coords;
+    for(unsigned int b = 0; b < outer_dims; ++b)
+    {
+        map_coords.set(3, b);
+        for(unsigned int c = 0; c < C; ++c)
+        {
+            map_coords.set(2, c % out_c);
+            const unsigned int offset = c / out_c;
+            for(unsigned int h = 0; h < H; ++h)
+            {
+                map_coords.set(1, h * stride + offset / stride);
+                for(unsigned int w = 0; w < W; ++w)
+                {
+                    const unsigned int dst_idx = w + W * (h + H * (c + C * b));
+                    map_coords.set(0, w * stride + offset % stride);
+                    dst[dst_idx] = *reinterpret_cast<const T *>(src(map_coords));
+                }
+            }
+        }
+    }
+
+    return dst;
+}
+
+template SimpleTensor<int32_t> reorg_layer(const SimpleTensor<int32_t> &src, int32_t stride);
+template SimpleTensor<int16_t> reorg_layer(const SimpleTensor<int16_t> &src, int32_t stride);
+template SimpleTensor<int8_t> reorg_layer(const SimpleTensor<int8_t> &src, int32_t stride);
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
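
The reorg reference is written as a gather: every destination coordinate is mapped back to exactly one source coordinate via out_c, offset / stride and offset % stride. A self-contained NCHW sketch of the same mapping on flat buffers (function name and layout are illustrative only):

#include <cstddef>
#include <vector>

// NCHW reorg as a gather: every destination element (b, c, h, w) reads exactly one
// source element. The destination is (W / stride) x (H / stride) spatially and has
// C * stride * stride channels, where C is the number of source channels.
std::vector<float> reorg_nchw(const std::vector<float> &src,
                              int batches, int C, int H, int W, int stride)
{
    const int W_out = W / stride;
    const int H_out = H / stride;
    const int C_out = C * stride * stride;

    std::vector<float> dst(std::size_t(batches) * C_out * H_out * W_out);
    for(int b = 0; b < batches; ++b)
    {
        for(int c = 0; c < C_out; ++c)
        {
            const int c_in   = c % C; // source channel
            const int offset = c / C; // which of the stride * stride sub-positions
            for(int h = 0; h < H_out; ++h)
            {
                const int h_in = h * stride + offset / stride;
                for(int w = 0; w < W_out; ++w)
                {
                    const int w_in = w * stride + offset % stride;
                    dst[((std::size_t(b) * C_out + c) * H_out + h) * W_out + w] =
                        src[((std::size_t(b) * C + c_in) * H + h_in) * W + w_in];
                }
            }
        }
    }
    return dst;
}
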
diff --git a/tests/validation/reference/ArithmeticSubtraction.h b/tests/validation/reference/ReorgLayer.h
similarity index 74%
copy from tests/validation/reference/ArithmeticSubtraction.h
copy to tests/validation/reference/ReorgLayer.h
index 9308314..c3f42f4 100644
--- a/tests/validation/reference/ArithmeticSubtraction.h
+++ b/tests/validation/reference/ReorgLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,11 +21,10 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef __ARM_COMPUTE_TEST_ARITHMETIC_SUBTRACTION_H__
-#define __ARM_COMPUTE_TEST_ARITHMETIC_SUBTRACTION_H__
+#ifndef __ARM_COMPUTE_TEST_REORG_LAYER_H__
+#define __ARM_COMPUTE_TEST_REORG_LAYER_H__
 
 #include "tests/SimpleTensor.h"
-#include "tests/validation/Helpers.h"
 
 namespace arm_compute
 {
@@ -35,10 +34,10 @@
 {
 namespace reference
 {
-template <typename T1, typename T2, typename T3>
-SimpleTensor<T3> arithmetic_subtraction(const SimpleTensor<T1> &src1, const SimpleTensor<T2> &src2, DataType dst_data_type, ConvertPolicy convert_policy);
+template <typename T>
+SimpleTensor<T> reorg_layer(const SimpleTensor<T> &src, int32_t stride);
 } // namespace reference
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
-#endif /* __ARM_COMPUTE_TEST_ARITHMETIC_SUBTRACTION_H__ */
+#endif /* __ARM_COMPUTE_TEST_REORG_LAYER_H__ */
diff --git a/tests/validation/reference/ReshapeLayer.cpp b/tests/validation/reference/ReshapeLayer.cpp
index 42f06e4..85bf3fc 100644
--- a/tests/validation/reference/ReshapeLayer.cpp
+++ b/tests/validation/reference/ReshapeLayer.cpp
@@ -33,6 +33,7 @@
 {
 namespace reference
 {
+/** [ReshapeLayer] **/
 template <typename T>
 SimpleTensor<T> reshape_layer(const SimpleTensor<T> &src, const TensorShape &output_shape)
 {
@@ -51,6 +52,7 @@
 template SimpleTensor<int32_t> reshape_layer(const SimpleTensor<int32_t> &src, const TensorShape &output_shape);
 template SimpleTensor<half> reshape_layer(const SimpleTensor<half> &src, const TensorShape &output_shape);
 template SimpleTensor<float> reshape_layer(const SimpleTensor<float> &src, const TensorShape &output_shape);
+/** [ReshapeLayer] **/
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/ReshapeLayer.h b/tests/validation/reference/ReshapeLayer.h
index fc6c716..9e42f80 100644
--- a/tests/validation/reference/ReshapeLayer.h
+++ b/tests/validation/reference/ReshapeLayer.h
@@ -34,8 +34,11 @@
 {
 namespace reference
 {
+/** [ReshapeLayer] **/
 template <typename T>
 SimpleTensor<T> reshape_layer(const SimpleTensor<T> &src, const TensorShape &output_shape);
+/** [ReshapeLayer] **/
+
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/Scale.cpp b/tests/validation/reference/Scale.cpp
index f8a8b88..2f7bf2d 100644
--- a/tests/validation/reference/Scale.cpp
+++ b/tests/validation/reference/Scale.cpp
@@ -37,8 +37,8 @@
 namespace reference
 {
 template <typename T>
-SimpleTensor<T> scale(const SimpleTensor<T> &in, float scale_x, float scale_y, InterpolationPolicy policy, BorderMode border_mode, T constant_border_value,
-                      SamplingPolicy sampling_policy, bool ceil_policy_scale)
+SimpleTensor<T> scale_core(const SimpleTensor<T> &in, float scale_x, float scale_y, InterpolationPolicy policy, BorderMode border_mode, T constant_border_value,
+                           SamplingPolicy sampling_policy, bool ceil_policy_scale)
 {
     // Add 1 if ceil_policy_scale is true
     const size_t round_value = ceil_policy_scale ? 1U : 0U;
@@ -168,8 +168,32 @@
     return out;
 }
 
-template SimpleTensor<uint8_t> scale(const SimpleTensor<uint8_t> &src, float scale_x, float scale_y, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value,
-                                     SamplingPolicy sampling_policy, bool ceil_policy_scale);
+template <typename T>
+SimpleTensor<T> scale(const SimpleTensor<T> &src, float scale_x, float scale_y, InterpolationPolicy policy, BorderMode border_mode, T constant_border_value,
+                      SamplingPolicy sampling_policy, bool ceil_policy_scale)
+{
+    return scale_core<T>(src, scale_x, scale_y, policy, border_mode, constant_border_value, sampling_policy, ceil_policy_scale);
+}
+
+template <>
+SimpleTensor<uint8_t> scale(const SimpleTensor<uint8_t> &src, float scale_x, float scale_y, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value,
+                            SamplingPolicy sampling_policy, bool ceil_policy_scale)
+{
+    SimpleTensor<uint8_t> dst;
+    if(src.quantization_info().scale != 0.f)
+    {
+        SimpleTensor<float> src_tmp                 = convert_from_asymmetric(src);
+        float               constant_border_value_f = scvt_f32_qasymm8(constant_border_value, src.quantization_info().scale, src.quantization_info().offset);
+        SimpleTensor<float> dst_tmp                 = scale_core<float>(src_tmp, scale_x, scale_y, policy, border_mode, constant_border_value_f, sampling_policy, ceil_policy_scale);
+        dst                                         = convert_to_asymmetric(dst_tmp, src.quantization_info());
+    }
+    else
+    {
+        dst = scale_core<uint8_t>(src, scale_x, scale_y, policy, border_mode, constant_border_value, sampling_policy, ceil_policy_scale);
+    }
+    return dst;
+}
+
 template SimpleTensor<int16_t> scale(const SimpleTensor<int16_t> &src, float scale_x, float scale_y, InterpolationPolicy policy, BorderMode border_mode, int16_t constant_border_value,
                                      SamplingPolicy sampling_policy, bool ceil_policy_scale);
 template SimpleTensor<half> scale(const SimpleTensor<half> &src, float scale_x, float scale_y, InterpolationPolicy policy, BorderMode border_mode, half constant_border_value,
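
The QASYMM8 specialization of scale simply brackets the existing float path with a dequantize and a requantize step, falling back to the plain uint8_t path when no quantization scale is attached. A standalone sketch of that asymmetric 8-bit round-trip, assuming the usual real_value = (q - offset) * scale convention (helper names are illustrative, not the library's):

#include <algorithm>
#include <cmath>
#include <cstdint>

// Asymmetric 8-bit quantization parameters: real_value = (q - offset) * scale.
struct QuantInfo
{
    float scale;
    int   offset;
};

// Dequantize a QASYMM8 value to float.
inline float dequantize_qasymm8(uint8_t q, QuantInfo qi)
{
    return (int(q) - qi.offset) * qi.scale;
}

// Quantize a float back to QASYMM8, rounding to nearest and saturating to [0, 255].
inline uint8_t quantize_qasymm8(float x, QuantInfo qi)
{
    const int q = int(std::lround(x / qi.scale)) + qi.offset;
    return uint8_t(std::max(0, std::min(255, q)));
}

// Pattern used by the specialization above: dequantize, run the float kernel, requantize.
template <typename FloatKernel>
uint8_t run_quantized(uint8_t in, QuantInfo qi, FloatKernel kernel)
{
    return quantize_qasymm8(kernel(dequantize_qasymm8(in, qi)), qi);
}
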
diff --git a/tests/validation/reference/Scale.h b/tests/validation/reference/Scale.h
index 566e30a..66267eb 100644
--- a/tests/validation/reference/Scale.h
+++ b/tests/validation/reference/Scale.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,6 +25,7 @@
 #define __ARM_COMPUTE_TEST_SCALE_H__
 
 #include "tests/SimpleTensor.h"
+#include "tests/validation/Helpers.h"
 
 namespace arm_compute
 {
@@ -35,7 +36,7 @@
 namespace reference
 {
 template <typename T>
-SimpleTensor<T> scale(const SimpleTensor<T> &in, float scale_x, float scale_y, InterpolationPolicy policy, BorderMode border_mode, T constant_border_value = 0,
+SimpleTensor<T> scale(const SimpleTensor<T> &src, float scale_x, float scale_y, InterpolationPolicy policy, BorderMode border_mode, T constant_border_value = 0,
                       SamplingPolicy sampling_policy = SamplingPolicy::CENTER, bool ceil_policy_scale = false);
 } // namespace reference
 } // namespace validation
diff --git a/tests/validation/reference/SliceOperations.cpp b/tests/validation/reference/SliceOperations.cpp
new file mode 100644
index 0000000..04b5b98
--- /dev/null
+++ b/tests/validation/reference/SliceOperations.cpp
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "SliceOperations.h"
+
+#include "arm_compute/core/utils/helpers/tensor_transform.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+template <typename T>
+SimpleTensor<T> slice(const SimpleTensor<T> &src, Coordinates starts, Coordinates ends)
+{
+    using namespace arm_compute::helpers::tensor_transform;
+
+    // Validation checks
+    ARM_COMPUTE_ERROR_ON(src.shape().num_dimensions() > 4);
+    ARM_COMPUTE_ERROR_ON(starts.num_dimensions() > src.shape().num_dimensions());
+    ARM_COMPUTE_ERROR_ON(std::any_of(starts.cbegin(), starts.cbegin() + starts.num_dimensions(), [](int i)
+    {
+        return i < 0;
+    }));
+    ARM_COMPUTE_ERROR_ON(ends.num_dimensions() > src.shape().num_dimensions());
+
+    // Get source shape
+    const TensorShape &src_shape = src.shape();
+
+    // Get actual end
+    Coordinates ends_abs = slice_absolute_end_coords(src_shape, ends);
+
+    // Get destination shape
+    TensorShape dst_shape = compute_slice_output_shape(src_shape, starts, ends_abs);
+
+    // Create destination tensor
+    SimpleTensor<T> dst{ dst_shape, src.data_type(), 1 };
+
+    // Perform slice
+    Window win;
+    win.use_tensor_dimensions(dst_shape);
+    execute_window_loop(win, [&](const Coordinates & id)
+    {
+        Coordinates offset;
+        for(unsigned int i = 0; i < id.num_dimensions(); ++i)
+        {
+            offset.set(i, starts[i] + id[i]);
+        }
+        *reinterpret_cast<T *>(dst(id)) = *reinterpret_cast<const T *>(src(offset));
+    });
+
+    return dst;
+}
+
+template SimpleTensor<float> slice(const SimpleTensor<float> &src, Coordinates starts, Coordinates ends);
+template SimpleTensor<half_float::half> slice(const SimpleTensor<half_float::half> &src, Coordinates starts, Coordinates ends);
+
+template <typename T>
+SimpleTensor<T> strided_slice(const SimpleTensor<T> &src,
+                              Coordinates starts, Coordinates ends, BiStrides strides,
+                              int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+{
+    using namespace arm_compute::helpers::tensor_transform;
+
+    // Validation checks
+    ARM_COMPUTE_ERROR_ON(src.shape().num_dimensions() > 4);
+    ARM_COMPUTE_ERROR_ON(starts.num_dimensions() > src.shape().num_dimensions());
+    ARM_COMPUTE_ERROR_ON(ends.num_dimensions() > src.shape().num_dimensions());
+    ARM_COMPUTE_ERROR_ON(strides.num_dimensions() > src.shape().num_dimensions());
+    ARM_COMPUTE_ERROR_ON(std::any_of(strides.cbegin(), strides.cbegin() + strides.num_dimensions(), [](int i)
+    {
+        return i == 0;
+    }));
+
+    // Get source shape
+    const TensorShape &src_shape = src.shape();
+
+    // Get actual start, end coordinates and strides
+    const Coordinates final_strides = strided_slice_strides(src_shape, strides);
+    const Coordinates starts_abs    = strided_slice_absolute_start_coords(src_shape, starts, final_strides, begin_mask);
+    const Coordinates ends_abs      = strided_slice_absolute_end_coords(src_shape, starts_abs, ends, final_strides, end_mask, shrink_axis_mask);
+
+    // Get destination shape
+    const TensorShape dst_shape = compute_strided_slice_output_shape(src_shape, starts_abs, ends_abs, final_strides);
+
+    // Create destination tensor
+    SimpleTensor<T> dst{ dst_shape, src.data_type(), 1 };
+
+    // Perform strided slice
+    Window win;
+    win.use_tensor_dimensions(dst_shape);
+    execute_window_loop(win, [&](const Coordinates & id)
+    {
+        Coordinates offset;
+        for(unsigned int i = 0; i < id.num_dimensions(); ++i)
+        {
+            offset.set(i, starts_abs[i] + id[i] * final_strides[i]);
+        }
+        *reinterpret_cast<T *>(dst(id)) = *reinterpret_cast<const T *>(src(offset));
+    });
+
+    return dst;
+}
+
+template SimpleTensor<float> strided_slice(const SimpleTensor<float> &src,
+                                           Coordinates starts, Coordinates ends, BiStrides strides,
+                                           int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask);
+template SimpleTensor<half_float::half> strided_slice(const SimpleTensor<half_float::half> &src,
+                                                      Coordinates starts, Coordinates ends, BiStrides strides,
+                                                      int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask);
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
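
Both slice and strided_slice reduce to the same gather: each destination coordinate id reads the source at start + id * stride per dimension, with the stride fixed to 1 for the plain slice. Restricted to one dimension for clarity, a standalone sketch is:

#include <cstddef>
#include <vector>

// 1-D strided slice: copy src[start], src[start + stride], ... up to (but excluding) 'end'.
// With stride == 1 this is the plain slice case; a negative stride walks backwards.
// The N-D references above apply the same rule per dimension: dst(id) = src(start + id * stride).
std::vector<float> strided_slice_1d(const std::vector<float> &src, int start, int end, int stride)
{
    std::vector<float> dst;
    for(int i = start; (stride > 0) ? (i < end) : (i > end); i += stride)
    {
        dst.push_back(src[std::size_t(i)]);
    }
    return dst;
}
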
diff --git a/tests/validation/reference/ArithmeticAddition.h b/tests/validation/reference/SliceOperations.h
similarity index 71%
rename from tests/validation/reference/ArithmeticAddition.h
rename to tests/validation/reference/SliceOperations.h
index faeabd7..89fe203 100644
--- a/tests/validation/reference/ArithmeticAddition.h
+++ b/tests/validation/reference/SliceOperations.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,11 +21,10 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef __ARM_COMPUTE_TEST_ARITHMETIC_ADDITION_H__
-#define __ARM_COMPUTE_TEST_ARITHMETIC_ADDITION_H__
+#ifndef __ARM_COMPUTE_TEST_SLICE_OPERATIONS_H__
+#define __ARM_COMPUTE_TEST_SLICE_OPERATIONS_H__
 
 #include "tests/SimpleTensor.h"
-#include "tests/validation/Helpers.h"
 
 namespace arm_compute
 {
@@ -36,13 +35,14 @@
 namespace reference
 {
 template <typename T>
-SimpleTensor<T> arithmetic_addition(const SimpleTensor<T> &src1, const SimpleTensor<T> &src2, SimpleTensor<T> &dst, ConvertPolicy convert_policy);
+SimpleTensor<T> slice(const SimpleTensor<T> &src, Coordinates starts, Coordinates ends);
 
 template <typename T>
-SimpleTensor<T> arithmetic_addition(const SimpleTensor<T> &src1, const SimpleTensor<T> &src2, DataType dst_data_type, ConvertPolicy convert_policy);
-
+SimpleTensor<T> strided_slice(const SimpleTensor<T> &src,
+                              Coordinates starts, Coordinates ends, BiStrides strides,
+                              int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask);
 } // namespace reference
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
-#endif /* __ARM_COMPUTE_TEST_ARITHMETIC_ADDITION_H__ */
+#endif /* __ARM_COMPUTE_TEST_SLICE_OPERATIONS_H__ */
diff --git a/tests/validation/reference/SoftmaxLayer.cpp b/tests/validation/reference/SoftmaxLayer.cpp
index aa640ad..f1b94c0 100644
--- a/tests/validation/reference/SoftmaxLayer.cpp
+++ b/tests/validation/reference/SoftmaxLayer.cpp
@@ -34,26 +34,38 @@
 namespace reference
 {
 template <typename T, typename std::enable_if<is_floating_point<T>::value, int>::type>
-SimpleTensor<T> softmax_layer(const SimpleTensor<T> &src, float beta)
+SimpleTensor<T> softmax_layer(const SimpleTensor<T> &src, float beta, size_t axis)
 {
     // Create reference
     SimpleTensor<T> dst{ src.shape(), src.data_type(), 1 };
 
-    // Compute reference
-    const int cols       = src.shape()[0];
-    const int upper_dims = src.num_elements() / cols;
+    // Compute reference. The lower dims are the product of the dimensions below the given
+    // axis (i.e. the flattened length of each slice to normalize), while the upper dims are
+    // the product of the remaining dimensions (i.e. the number of slices to normalize).
+
+    int lower_dims = 1;
+    for(size_t i = 0; i < axis; i++)
+    {
+        lower_dims *= src.shape()[i];
+    }
+
+    int upper_dims = 1;
+    for(size_t i = axis; i < TensorShape::num_max_dimensions; i++)
+    {
+        upper_dims *= src.shape()[i];
+    }
 
     for(int r = 0; r < upper_dims; ++r)
     {
-        const T *src_row_ptr = src.data() + r * cols;
-        T       *dst_row_ptr = dst.data() + r * cols;
+        const T *src_row_ptr = src.data() + r * lower_dims;
+        T       *dst_row_ptr = dst.data() + r * lower_dims;
 
         // Find max
-        const T max = *std::max_element(src_row_ptr, src_row_ptr + cols);
+        const T max = *std::max_element(src_row_ptr, src_row_ptr + lower_dims);
 
         // Regularize
         T sum(0.f);
-        std::transform(src_row_ptr, src_row_ptr + cols, dst_row_ptr, [&sum, max, beta](T val)
+        std::transform(src_row_ptr, src_row_ptr + lower_dims, dst_row_ptr, [&sum, max, beta](T val)
         {
             const T res(std::exp((val - max) * beta));
             sum += res;
@@ -61,7 +73,7 @@
         });
 
         // Normalize
-        std::transform(dst_row_ptr, dst_row_ptr + cols, dst_row_ptr, [sum](T val)
+        std::transform(dst_row_ptr, dst_row_ptr + lower_dims, dst_row_ptr, [sum](T val)
         {
             return val / sum;
         });
@@ -71,20 +83,20 @@
 }
 
 template <typename T, typename std::enable_if<std::is_same<T, uint8_t>::value, int>::type>
-SimpleTensor<T> softmax_layer(const SimpleTensor<T> &src, float beta)
+SimpleTensor<T> softmax_layer(const SimpleTensor<T> &src, float beta, size_t axis)
 {
     // Note: Output quantization info should always have scale = 1/256 and offset = 0
     const QuantizationInfo output_quantization_info = QuantizationInfo(1.f / 256, 0);
 
     SimpleTensor<float> src_tmp = convert_from_asymmetric(src);
-    SimpleTensor<float> dst_tmp = softmax_layer<float>(src_tmp, beta);
+    SimpleTensor<float> dst_tmp = softmax_layer<float>(src_tmp, beta, axis);
     SimpleTensor<T>     dst     = convert_to_asymmetric(dst_tmp, output_quantization_info);
     return dst;
 }
 
-template SimpleTensor<float> softmax_layer(const SimpleTensor<float> &src, float beta);
-template SimpleTensor<half> softmax_layer(const SimpleTensor<half> &src, float beta);
-template SimpleTensor<uint8_t> softmax_layer(const SimpleTensor<uint8_t> &src, float beta);
+template SimpleTensor<float> softmax_layer(const SimpleTensor<float> &src, float beta, size_t axis);
+template SimpleTensor<half> softmax_layer(const SimpleTensor<half> &src, float beta, size_t axis);
+template SimpleTensor<uint8_t> softmax_layer(const SimpleTensor<uint8_t> &src, float beta, size_t axis);
 } // namespace reference
 } // namespace validation
 } // namespace test
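
With the new axis parameter the reference flattens all dimensions below the axis into rows of length lower_dims and normalizes each row independently, subtracting the row maximum before exponentiation for numerical stability. A standalone sketch of the per-row work:

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// Numerically stable softmax of one row, mirroring the per-row work of the reference:
// shift by the row maximum, exponentiate with a beta factor, then normalize by the sum.
std::vector<float> softmax_row(const std::vector<float> &row, float beta)
{
    if(row.empty())
    {
        return {};
    }

    const float        max_val = *std::max_element(row.begin(), row.end());
    std::vector<float> out(row.size());
    float              sum = 0.0f;
    for(std::size_t i = 0; i < row.size(); ++i)
    {
        out[i] = std::exp((row[i] - max_val) * beta);
        sum += out[i];
    }
    for(float &v : out)
    {
        v /= sum;
    }
    return out;
}
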
diff --git a/tests/validation/reference/SoftmaxLayer.h b/tests/validation/reference/SoftmaxLayer.h
index 21dca1e..d21ca2b 100644
--- a/tests/validation/reference/SoftmaxLayer.h
+++ b/tests/validation/reference/SoftmaxLayer.h
@@ -36,10 +36,10 @@
 namespace reference
 {
 template <typename T, typename std::enable_if<is_floating_point<T>::value, int>::type = 0>
-SimpleTensor<T> softmax_layer(const SimpleTensor<T> &src, float beta);
+SimpleTensor<T> softmax_layer(const SimpleTensor<T> &src, float beta, size_t axis = 1);
 
 template <typename T, typename std::enable_if<std::is_same<T, uint8_t>::value, int>::type = 0>
-SimpleTensor<T> softmax_layer(const SimpleTensor<T> &src, float beta);
+SimpleTensor<T> softmax_layer(const SimpleTensor<T> &src, float beta, size_t axis = 1);
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/SpaceToBatch.cpp b/tests/validation/reference/SpaceToBatch.cpp
new file mode 100644
index 0000000..979ab94
--- /dev/null
+++ b/tests/validation/reference/SpaceToBatch.cpp
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "SpaceToBatch.h"
+
+#include "tests/validation/Helpers.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+// Space to Batch
+template <typename T>
+SimpleTensor<T> space_to_batch(const SimpleTensor<T> &src, const SimpleTensor<int32_t> &block_shape, const SimpleTensor<int32_t> &paddings, const TensorShape &dst_shape)
+{
+    SimpleTensor<T> result(dst_shape, src.data_type());
+
+    auto width_out  = static_cast<int>(dst_shape[0]);
+    auto height_out = static_cast<int>(dst_shape[1]);
+    auto z_out      = static_cast<int>(dst_shape[2]);
+
+    int out_pos = 0;
+    for(int batch = 0; batch < static_cast<int>(dst_shape[3]); ++batch)
+    {
+        for(int z = 0; z < z_out; ++z)
+        {
+            for(int y = 0; y < height_out; ++y)
+            {
+                for(int x = 0; x < width_out; ++x)
+                {
+                    if(x < paddings[0] || x > width_out - paddings[1] - 1
+                       || y < paddings[2] || y > height_out - paddings[3] - 1)
+                    {
+                        result[out_pos] = 0;
+                    }
+                    else
+                    {
+                        const int r      = dst_shape[3] / (block_shape[0] * block_shape[1]);
+                        const int in_x   = (block_shape[0] * (x - paddings[0]) + (batch / r) % block_shape[0]);
+                        const int in_y   = (block_shape[1] * (y - paddings[2]) + (batch / r) / block_shape[0]);
+                        int       in_pos = in_x + src.shape()[0] * in_y + z * src.shape()[0] * src.shape()[1] + (batch % r) * src.shape()[0] * src.shape()[1] * src.shape()[2];
+                        result[out_pos]  = src[in_pos];
+                    }
+                    ++out_pos;
+                }
+            }
+        }
+    }
+
+    return result;
+}
+
+template SimpleTensor<float> space_to_batch(const SimpleTensor<float> &src, const SimpleTensor<int32_t> &block_shape, const SimpleTensor<int32_t> &paddings, const TensorShape &dst_shape);
+template SimpleTensor<half> space_to_batch(const SimpleTensor<half> &src, const SimpleTensor<int32_t> &block_shape, const SimpleTensor<int32_t> &paddings, const TensorShape &dst_shape);
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
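
Ignoring the zero padding and the second spatial dimension, space-to-batch is an interleaved gather: output batch b of a block takes every block-th input element starting at offset b. A one-dimensional sketch of that core mapping (illustrative only; the reference above additionally handles 2-D blocks and padded borders):

#include <cstddef>
#include <vector>

// 1-D space-to-batch without padding: an input row whose length is a multiple of 'block'
// becomes 'block' output rows of length len / block, where output row b, element x reads
// input element block * x + b. The 2-D reference applies the same idea per spatial axis
// and zero-fills the padded border positions.
std::vector<std::vector<float>> space_to_batch_1d(const std::vector<float> &src, int block)
{
    const int out_len = int(src.size()) / block;

    std::vector<std::vector<float>> dst(block, std::vector<float>(out_len));
    for(int b = 0; b < block; ++b)
    {
        for(int x = 0; x < out_len; ++x)
        {
            dst[std::size_t(b)][std::size_t(x)] = src[std::size_t(block * x + b)];
        }
    }
    return dst;
}
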
diff --git a/tests/validation/reference/ArithmeticSubtraction.h b/tests/validation/reference/SpaceToBatch.h
similarity index 72%
copy from tests/validation/reference/ArithmeticSubtraction.h
copy to tests/validation/reference/SpaceToBatch.h
index 9308314..ba35211 100644
--- a/tests/validation/reference/ArithmeticSubtraction.h
+++ b/tests/validation/reference/SpaceToBatch.h
@@ -1,9 +1,9 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to
  * deal in the Software without restriction, including without limitation the
  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef __ARM_COMPUTE_TEST_ARITHMETIC_SUBTRACTION_H__
-#define __ARM_COMPUTE_TEST_ARITHMETIC_SUBTRACTION_H__
+#ifndef __ARM_COMPUTE_TEST_SPACE_TO_BATCH_LAYER_H__
+#define __ARM_COMPUTE_TEST_SPACE_TO_BATCH_LAYER_H__
 
 #include "tests/SimpleTensor.h"
 #include "tests/validation/Helpers.h"
@@ -35,10 +35,10 @@
 {
 namespace reference
 {
-template <typename T1, typename T2, typename T3>
-SimpleTensor<T3> arithmetic_subtraction(const SimpleTensor<T1> &src1, const SimpleTensor<T2> &src2, DataType dst_data_type, ConvertPolicy convert_policy);
+template <typename T>
+SimpleTensor<T> space_to_batch(const SimpleTensor<T> &src, const SimpleTensor<int32_t> &block_shape, const SimpleTensor<int32_t> &paddings, const TensorShape &dst_shape);
 } // namespace reference
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
-#endif /* __ARM_COMPUTE_TEST_ARITHMETIC_SUBTRACTION_H__ */
+#endif /* __ARM_COMPUTE_TEST_SPACE_TO_BATCH_LAYER_H__ */
diff --git a/tests/validation/reference/UpsampleLayer.cpp b/tests/validation/reference/UpsampleLayer.cpp
new file mode 100644
index 0000000..876f6d7
--- /dev/null
+++ b/tests/validation/reference/UpsampleLayer.cpp
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "UpsampleLayer.h"
+
+#include "tests/validation/Helpers.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+template <typename T>
+SimpleTensor<T> upsample_layer(const SimpleTensor<T> &src,
+                               const Size2D &info, const InterpolationPolicy policy)
+{
+    ARM_COMPUTE_ERROR_ON(policy != InterpolationPolicy::NEAREST_NEIGHBOR);
+    ARM_COMPUTE_UNUSED(policy);
+
+    TensorShape output_shape = src.shape();
+    output_shape.set(0, src.shape().x() * info.x());
+    output_shape.set(1, src.shape().y() * info.y());
+
+    // Create reference
+    const int       stride_x   = info.x();
+    const int       stride_y   = info.y();
+    int             width_out  = output_shape.x();
+    int             height_out = output_shape.y();
+    SimpleTensor<T> out{ output_shape, src.data_type(), 1, src.quantization_info() };
+
+    const int width_in      = src.shape().x();
+    const int height_in     = src.shape().y();
+    const int num_2d_slices = src.shape().total_size() / (width_in * height_in);
+
+    for(int slice = 0; slice < num_2d_slices; ++slice)
+    {
+        const int offset_slice_in  = slice * width_in * height_in;
+        const int offset_slice_out = slice * height_out * width_out;
+        for(int y = 0; y < height_out; ++y)
+        {
+            for(int x = 0; x < width_out; ++x)
+            {
+                const int out_offset = y * width_out + x;
+                const int in_offset  = (y / stride_y) * width_in + x / stride_x;
+
+                T       *_out = out.data() + offset_slice_out + out_offset;
+                const T *in   = src.data() + offset_slice_in + in_offset;
+                *_out         = *in;
+            }
+        }
+    }
+
+    return out;
+}
+
+template SimpleTensor<float> upsample_layer(const SimpleTensor<float> &src,
+                                            const Size2D &info, const InterpolationPolicy policy);
+template SimpleTensor<half> upsample_layer(const SimpleTensor<half> &src,
+                                           const Size2D &info, const InterpolationPolicy policy);
+template SimpleTensor<uint8_t> upsample_layer(const SimpleTensor<uint8_t> &src,
+                                              const Size2D &info, const InterpolationPolicy policy);
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
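
The upsample reference is a nearest-neighbour gather: output pixel (x, y) copies input pixel (x / stride_x, y / stride_y), so every input value is replicated into a stride_x by stride_y block. A single-slice standalone sketch:

#include <cstddef>
#include <vector>

// Nearest-neighbour upsampling of one W x H slice by integer factors sx and sy:
// output pixel (x, y) copies input pixel (x / sx, y / sy), so every input value is
// replicated into an sx by sy block, exactly as in the reference's inner loops.
std::vector<float> upsample_nn(const std::vector<float> &src, int w, int h, int sx, int sy)
{
    const int          w_out = w * sx;
    const int          h_out = h * sy;
    std::vector<float> dst(std::size_t(w_out) * h_out);
    for(int y = 0; y < h_out; ++y)
    {
        for(int x = 0; x < w_out; ++x)
        {
            dst[std::size_t(y) * w_out + x] = src[std::size_t(y / sy) * w + (x / sx)];
        }
    }
    return dst;
}
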
diff --git a/tests/validation/reference/ArithmeticSubtraction.h b/tests/validation/reference/UpsampleLayer.h
similarity index 76%
copy from tests/validation/reference/ArithmeticSubtraction.h
copy to tests/validation/reference/UpsampleLayer.h
index 9308314..ecb458a 100644
--- a/tests/validation/reference/ArithmeticSubtraction.h
+++ b/tests/validation/reference/UpsampleLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef __ARM_COMPUTE_TEST_ARITHMETIC_SUBTRACTION_H__
-#define __ARM_COMPUTE_TEST_ARITHMETIC_SUBTRACTION_H__
+#ifndef __ARM_COMPUTE_TEST_UPSAMPLE_LAYER_H__
+#define __ARM_COMPUTE_TEST_UPSAMPLE_LAYER_H__
 
 #include "tests/SimpleTensor.h"
 #include "tests/validation/Helpers.h"
@@ -35,10 +35,11 @@
 {
 namespace reference
 {
-template <typename T1, typename T2, typename T3>
-SimpleTensor<T3> arithmetic_subtraction(const SimpleTensor<T1> &src1, const SimpleTensor<T2> &src2, DataType dst_data_type, ConvertPolicy convert_policy);
+template <typename T>
+SimpleTensor<T> upsample_layer(const SimpleTensor<T> &src,
+                               const Size2D &info, const InterpolationPolicy policy);
 } // namespace reference
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
-#endif /* __ARM_COMPUTE_TEST_ARITHMETIC_SUBTRACTION_H__ */
+#endif /* __ARM_COMPUTE_TEST_UPSAMPLE_LAYER_H__ */
diff --git a/tests/validation/reference/WidthConcatenateLayer.cpp b/tests/validation/reference/WidthConcatenateLayer.cpp
index 8662199..6be171b 100644
--- a/tests/validation/reference/WidthConcatenateLayer.cpp
+++ b/tests/validation/reference/WidthConcatenateLayer.cpp
@@ -59,20 +59,24 @@
     {
         ARM_COMPUTE_ERROR_ON(width_offset >= width_out);
 
-        const int width  = src.shape().x();
-        const int height = src.shape().y();
-        const int depth  = src.shape().z();
+        const int width      = src.shape().x();
+        const int height     = src.shape().y();
+        const int depth      = src.shape().z();
+        const int upper_dims = src.shape().total_size() / (width * height * depth);
 
         const T *src_ptr = src.data();
         T       *dst_ptr = dst.data();
 
-        for(int d = 0; d < depth; ++d)
+        for(int u = 0; u < upper_dims; ++u)
         {
-            for(int r = 0; r < height; ++r)
+            for(int d = 0; d < depth; ++d)
             {
-                int offset = d * height + r;
-                std::copy(src_ptr, src_ptr + width, dst_ptr + width_offset + offset * width_out);
-                src_ptr += width;
+                for(int r = 0; r < height; ++r)
+                {
+                    const int offset = u * height * depth + d * height + r;
+                    std::copy(src_ptr, src_ptr + width, dst_ptr + width_offset + offset * width_out);
+                    src_ptr += width;
+                }
             }
         }
 
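
With the added upper_dims loop the copy now also covers batches: every source row of length width is placed into the wider destination row at the running width_offset. A two-input, row-level sketch where 'rows' stands for everything above the x dimension flattened (names are illustrative):

#include <algorithm>
#include <cstddef>
#include <vector>

// Concatenate two row-major buffers along the width (x) axis: every destination row
// holds the corresponding row of 'a' followed by the row of 'b'. 'rows' corresponds to
// height * depth * batches, i.e. all dimensions above x flattened.
std::vector<float> concat_width(const std::vector<float> &a, int width_a,
                                const std::vector<float> &b, int width_b, int rows)
{
    const int          width_out = width_a + width_b;
    std::vector<float> dst(std::size_t(rows) * width_out);
    for(int r = 0; r < rows; ++r)
    {
        std::copy(a.begin() + std::size_t(r) * width_a, a.begin() + std::size_t(r + 1) * width_a,
                  dst.begin() + std::size_t(r) * width_out);
        std::copy(b.begin() + std::size_t(r) * width_b, b.begin() + std::size_t(r + 1) * width_b,
                  dst.begin() + std::size_t(r) * width_out + width_a);
    }
    return dst;
}
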
diff --git a/tests/validation/reference/Winograd.cpp b/tests/validation/reference/Winograd.cpp
index 132d252..294993b 100644
--- a/tests/validation/reference/Winograd.cpp
+++ b/tests/validation/reference/Winograd.cpp
@@ -75,6 +75,18 @@
         0.f, -1.f, 0.f, 21.f / 4.f, 0.f, -21.f / 4.f, 0.f, 1.f
     };
 
+    static const float imatrix2x1_7x7[] =
+    {
+        -36.0f, 0.0f, 49.0f, 0.0f, -14.0f, 0.0f, 1.0f, 0.0f,
+        0.0f, -36.0f, 36.0f, 13.0f, -13.0f, -1.0f, 1.0f, 0.0f,
+        0.0f, 36.0f, 36.0f, -13.0f, -13.0f, 1.0f, 1.0f, 0.0f,
+        0.0f, -18.0f, 9.0f, 20.0f, -10.0f, -2.0f, 1.0f, 0.0f,
+        0.0f, 18.0f, 9.0f, -20.0f, -10.0f, 2.0f, 1.0f, 0.0f,
+        0.0f, -12.0f, 4.0f, 15.0f, -5.0f, -3.0f, 1.0f, 0.0f,
+        0.0f, 12.0f, 4.0f, -15.0f, -5.0f, 3.0f, 1.0f, 0.0f,
+        0.0f, -36.0f, 0.0f, 49.0f, 0.0f, -14.0f, 0.0f, 1.0f
+    };
+
     // ------------------------------------------
 
     // Winograd filter transform matrices
@@ -109,6 +121,18 @@
 
     };
 
+    static const float fmatrix2x1_7x7[] =
+    {
+        -1.0f / 36.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+        1.0f / 48.0f, -1.0f / 48.0f, 1.0f / 48.0f, -1.0f / 48.0f, 1.0f / 48.0f, -1.0f / 48.0f, 1.0f / 48.0f,
+        1.0f / 48.0f, 1.0f / 48.0f, 1.0f / 48.0f, 1.0f / 48.0f, 1.0f / 48.0f, 1.0f / 48.0f, 1.0f / 48.0f,
+        -1.0f / 120.0f, 1.0f / 60.0f, -1.0f / 30.0f, 1.0f / 15.0f, -2.0f / 15.0f, 4.0f / 15.0f, -8.0f / 15.0f,
+        -1.0f / 120.0f, -1.0f / 60.0f, -1.0f / 30.0f, -1.0f / 15.0f, -2.0f / 15.0f, -4.0f / 15.0f, -8.0f / 15.0f,
+        1.0f / 720.0f, -1.0f / 240.0f, 1.0f / 80.0f, -3.0f / 80.0f, 9.0f / 80.0f, -27.0f / 80.0f, 81.0f / 80.0f,
+        1.0f / 720.0f, 1.0f / 240.0f, 1.0f / 80.0f, 3.0f / 80.0f, 9.0f / 80.0f, 27.0f / 80.0f, 81.0f / 80.0f,
+        0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f
+    };
+
     // ------------------------------------------
 
     // Winograd output transform matrices
@@ -134,6 +158,12 @@
         0.0f, 1.0f, -1.0f, 8.0f, -8.0f, 1.0f, -1.0f, 1.0f
     };
 
+    static const float omatrix2x1_7x7[] =
+    {
+        1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.0f,
+        0.0f, -1.0f, 1.0f, -2.0f, 2.0f, -3.0f, 3.0f, 1.0f
+    };
+
     // ------------------------------------------
 
     using WinogradKey = std::tuple<std::pair<int, int>, std::pair<int, int>, WinogradTransformType>;
@@ -149,6 +179,8 @@
         { WinogradKey(std::pair<int, int>(1, 4), std::pair<int, int>(1, 3), WinogradTransformType::INPUT), imatrix4x4_3x3 },
         { WinogradKey(std::pair<int, int>(4, 4), std::pair<int, int>(5, 5), WinogradTransformType::INPUT), imatrix4x4_5x5 },
         { WinogradKey(std::pair<int, int>(4, 1), std::pair<int, int>(5, 1), WinogradTransformType::INPUT), imatrix4x4_5x5 },
+        { WinogradKey(std::pair<int, int>(2, 1), std::pair<int, int>(7, 1), WinogradTransformType::INPUT), imatrix2x1_7x7 },
+        { WinogradKey(std::pair<int, int>(1, 2), std::pair<int, int>(1, 7), WinogradTransformType::INPUT), imatrix2x1_7x7 },
         { WinogradKey(std::pair<int, int>(1, 4), std::pair<int, int>(1, 5), WinogradTransformType::INPUT), imatrix4x4_5x5 },
         { WinogradKey(std::pair<int, int>(2, 2), std::pair<int, int>(3, 3), WinogradTransformType::FILTER), fmatrix2x2_3x3 },
         { WinogradKey(std::pair<int, int>(4, 4), std::pair<int, int>(3, 3), WinogradTransformType::FILTER), fmatrix4x4_3x3 },
@@ -158,6 +190,8 @@
         { WinogradKey(std::pair<int, int>(1, 4), std::pair<int, int>(1, 3), WinogradTransformType::FILTER), fmatrix4x4_3x3 },
         { WinogradKey(std::pair<int, int>(4, 4), std::pair<int, int>(5, 5), WinogradTransformType::FILTER), fmatrix4x4_5x5 },
         { WinogradKey(std::pair<int, int>(4, 1), std::pair<int, int>(5, 1), WinogradTransformType::FILTER), fmatrix4x4_5x5 },
+        { WinogradKey(std::pair<int, int>(2, 1), std::pair<int, int>(7, 1), WinogradTransformType::FILTER), fmatrix2x1_7x7 },
+        { WinogradKey(std::pair<int, int>(1, 2), std::pair<int, int>(1, 7), WinogradTransformType::FILTER), fmatrix2x1_7x7 },
         { WinogradKey(std::pair<int, int>(1, 4), std::pair<int, int>(1, 5), WinogradTransformType::FILTER), fmatrix4x4_5x5 },
         { WinogradKey(std::pair<int, int>(2, 2), std::pair<int, int>(3, 3), WinogradTransformType::OUTPUT), omatrix2x2_3x3 },
         { WinogradKey(std::pair<int, int>(4, 4), std::pair<int, int>(3, 3), WinogradTransformType::OUTPUT), omatrix4x4_3x3 },
@@ -167,6 +201,8 @@
         { WinogradKey(std::pair<int, int>(1, 4), std::pair<int, int>(1, 3), WinogradTransformType::OUTPUT), omatrix4x4_3x3 },
         { WinogradKey(std::pair<int, int>(4, 4), std::pair<int, int>(5, 5), WinogradTransformType::OUTPUT), omatrix4x4_5x5 },
         { WinogradKey(std::pair<int, int>(4, 1), std::pair<int, int>(5, 1), WinogradTransformType::OUTPUT), omatrix4x4_5x5 },
+        { WinogradKey(std::pair<int, int>(2, 1), std::pair<int, int>(7, 1), WinogradTransformType::OUTPUT), omatrix2x1_7x7 },
+        { WinogradKey(std::pair<int, int>(1, 2), std::pair<int, int>(1, 7), WinogradTransformType::OUTPUT), omatrix2x1_7x7 },
         { WinogradKey(std::pair<int, int>(1, 4), std::pair<int, int>(1, 5), WinogradTransformType::OUTPUT), omatrix4x4_5x5 },
     };
 
@@ -232,7 +268,7 @@
     initialize_matrix_transform(matrix, output_tile_size, kernel_size, WinogradTransformType::INPUT);
 
     // Transpose matrix
-    transpose_matrix(matrix, matrix_transposed);
+    transpose_matrix<T>(matrix, matrix_transposed);
 
     const int in_w        = in.shape().x();
     const int in_h        = in.shape().y();
@@ -293,14 +329,14 @@
                     int yi = y * step_y - conv_info.pad_top();
 
                     // Get the tile from the input tensor
-                    get_tile(in, src_tile, Coordinates(xi, yi, z, b));
+                    get_tile<T>(in, src_tile, Coordinates(xi, yi, z, b));
 
                     // Fill partially with zeros in case of 1D convolution
-                    zeros(src_tile, anchor_zeros, shape_zeros);
+                    zeros<T>(src_tile, anchor_zeros, shape_zeros);
 
                     // Compute the transformation
-                    matrix_multiply(matrix, src_tile, tmp_tile);
-                    matrix_multiply(tmp_tile, matrix_transposed, dst_tile);
+                    matrix_multiply<T>(matrix, src_tile, tmp_tile);
+                    matrix_multiply<T>(tmp_tile, matrix_transposed, dst_tile);
 
                     // Store the output tile across the channels
                     for(int i = 0; i < out_d; ++i)
@@ -358,7 +394,7 @@
     initialize_matrix_transform(trans_matrix, output_tile_size, kernel_size, WinogradTransformType::FILTER);
 
     // Transpose the transformation matrix
-    transpose_matrix(trans_matrix, trans_matrix_transposed);
+    transpose_matrix<T>(trans_matrix, trans_matrix_transposed);
 
     const int num_channels = in.shape()[2];
     const int num_filters  = in.shape()[3];
@@ -374,13 +410,13 @@
             for(int z = 0; z < num_channels; ++z)
             {
                 // Load the tile from the input tensor
-                get_tile(in, input_tile, Coordinates(0, 0, z, w, n));
+                get_tile<T>(in, input_tile, Coordinates(0, 0, z, w, n));
 
                 // First transformation
-                matrix_multiply(trans_matrix, input_tile, tmp_tile);
+                matrix_multiply<T>(trans_matrix, input_tile, tmp_tile);
 
                 // Second transformation
-                matrix_multiply(tmp_tile, trans_matrix_transposed, transf_tile);
+                matrix_multiply<T>(tmp_tile, trans_matrix_transposed, transf_tile);
 
                 // Store the output tile across the channels
                 const int output_offset = w + z * num_filters;
@@ -451,7 +487,7 @@
     initialize_matrix_transform(trans_matrix, output_tile_size, kernel_size, WinogradTransformType::OUTPUT);
 
     // Transpose the transformation matrix
-    transpose_matrix(trans_matrix, trans_matrix_transposed);
+    transpose_matrix<T>(trans_matrix, trans_matrix_transposed);
 
     const int w_in        = in.shape()[0];
     const int h_in        = in.shape()[1];
@@ -487,7 +523,7 @@
     const int step_y_transf_tile = kernel_size.width == 1 ? 1 : output_tile.shape()[0];
 
     // Initialize with zeros the input tile
-    zeros(input_tile, Coordinates(0, 0), input_tile.shape());
+    zeros<T>(input_tile, Coordinates(0, 0), input_tile.shape());
 
     for(int n = 0; n < num_batches; ++n)
     {
@@ -502,10 +538,10 @@
                 }
 
                 // First transformation
-                matrix_multiply(trans_matrix, input_tile, tmp_tile);
+                matrix_multiply<T>(trans_matrix, input_tile, tmp_tile);
 
                 // Second transformation
-                matrix_multiply(tmp_tile, trans_matrix_transposed, output_tile);
+                matrix_multiply<T>(tmp_tile, trans_matrix_transposed, output_tile);
 
                 // Store the output tile
                 const int xo = (y % num_tiles_x) * out_tile_w;
@@ -538,6 +574,10 @@
 template SimpleTensor<float> winograd_filter_transform(const SimpleTensor<float> &in, const TensorShape &output_shape, const WinogradInfo &winograd_info);
 template SimpleTensor<float> winograd_input_transform(const SimpleTensor<float> &in, const TensorShape &output_shape, const WinogradInfo &winograd_info);
 template SimpleTensor<float> winograd_output_transform(const SimpleTensor<float> &in, const SimpleTensor<float> &b, const TensorShape &output_shape, const WinogradInfo &winograd_info);
+template SimpleTensor<half> winograd_filter_transform(const SimpleTensor<half> &in, const TensorShape &output_shape, const WinogradInfo &winograd_info);
+template SimpleTensor<half> winograd_input_transform(const SimpleTensor<half> &in, const TensorShape &output_shape, const WinogradInfo &winograd_info);
+template SimpleTensor<half> winograd_output_transform(const SimpleTensor<half> &in, const SimpleTensor<half> &b, const TensorShape &output_shape, const WinogradInfo &winograd_info);
+
 } // namespace reference
 } // namespace validation
 } // namespace test
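
All the transforms above share one pattern: pre-multiply the tile by a fixed transform matrix and post-multiply by its transpose, out = M * tile * M^T (for the new 1-D 2x1/7x1 matrices one of the two passes degenerates to size 1). A small standalone sketch of that two-sided product on row-major buffers:

#include <cstddef>
#include <vector>

// Row-major matrix product: c (m x p) = a (m x n) * b (n x p).
std::vector<float> matmul(const std::vector<float> &a, const std::vector<float> &b, int m, int n, int p)
{
    std::vector<float> c(std::size_t(m) * p, 0.0f);
    for(int i = 0; i < m; ++i)
    {
        for(int k = 0; k < n; ++k)
        {
            for(int j = 0; j < p; ++j)
            {
                c[std::size_t(i) * p + j] += a[std::size_t(i) * n + k] * b[std::size_t(k) * p + j];
            }
        }
    }
    return c;
}

// Row-major transpose of an m x n matrix.
std::vector<float> transpose(const std::vector<float> &a, int m, int n)
{
    std::vector<float> t(std::size_t(m) * n);
    for(int i = 0; i < m; ++i)
    {
        for(int j = 0; j < n; ++j)
        {
            t[std::size_t(j) * m + i] = a[std::size_t(i) * n + j];
        }
    }
    return t;
}

// Two-sided tile transform: out = m_mat * tile * m_mat^T, with m_mat an r x c
// transform matrix and tile a c x c square tile, giving an r x r result.
std::vector<float> transform_tile(const std::vector<float> &m_mat, int r, int c, const std::vector<float> &tile)
{
    const std::vector<float> tmp = matmul(m_mat, tile, r, c, c); // r x c
    return matmul(tmp, transpose(m_mat, r, c), r, c, r);         // r x r
}
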
diff --git a/tests/validation/reference/YOLOLayer.cpp b/tests/validation/reference/YOLOLayer.cpp
new file mode 100644
index 0000000..a12f411
--- /dev/null
+++ b/tests/validation/reference/YOLOLayer.cpp
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "YOLOLayer.h"
+
+#include "ActivationLayer.h"
+
+#include "arm_compute/core/Types.h"
+#include "tests/validation/Helpers.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+template <typename T, typename std::enable_if<is_floating_point<T>::value, int>::type>
+SimpleTensor<T> yolo_layer(const SimpleTensor<T> &src, const ActivationLayerInfo &info, int32_t num_classes)
+{
+    // Create reference
+    SimpleTensor<T> dst{ src.shape(), src.data_type() };
+
+    // Compute reference
+    const T a(info.a());
+    const T b(info.b());
+
+    for(int i = 0; i < src.num_elements(); ++i)
+    {
+        const size_t z = index2coord(dst.shape(), i).z() % (num_classes + 5);
+
+        if(z != 2 && z != 3)
+        {
+            dst[i] = activate_float<T>(src[i], a, b, info.activation());
+        }
+        else
+        {
+            dst[i] = src[i];
+        }
+    }
+
+    return dst;
+}
+
+template <>
+SimpleTensor<uint8_t> yolo_layer<uint8_t>(const SimpleTensor<uint8_t> &src, const ActivationLayerInfo &info, int32_t num_classes)
+{
+    SimpleTensor<float>   src_tmp = convert_from_asymmetric(src);
+    SimpleTensor<float>   dst_tmp = yolo_layer<float>(src_tmp, info, num_classes);
+    SimpleTensor<uint8_t> dst     = convert_to_asymmetric(dst_tmp, src.quantization_info());
+    return dst;
+}
+
+template SimpleTensor<float> yolo_layer(const SimpleTensor<float> &src, const ActivationLayerInfo &info, int32_t num_classes);
+template SimpleTensor<half> yolo_layer(const SimpleTensor<half> &src, const ActivationLayerInfo &info, int32_t num_classes);
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
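
Per group of (num_classes + 5) channels the YOLO reference activates everything except channels 2 and 3 (the raw box width and height), which pass through unchanged. A standalone per-element sketch, using a logistic sigmoid as the example activation (the actual activation comes from ActivationLayerInfo):

#include <cmath>

// YOLO-style selective activation: within each group of (num_classes + 5) channels,
// channels 2 and 3 (the raw box width and height) are left untouched, while every
// other channel goes through the activation (a logistic sigmoid in this sketch).
float yolo_activate(float value, int channel, int num_classes)
{
    const int z = channel % (num_classes + 5);
    if(z == 2 || z == 3)
    {
        return value; // raw box size: no activation
    }
    return 1.0f / (1.0f + std::exp(-value)); // x/y offset, objectness and class scores
}
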
diff --git a/tests/validation/reference/ArithmeticAddition.h b/tests/validation/reference/YOLOLayer.h
similarity index 70%
copy from tests/validation/reference/ArithmeticAddition.h
copy to tests/validation/reference/YOLOLayer.h
index faeabd7..659f1dd 100644
--- a/tests/validation/reference/ArithmeticAddition.h
+++ b/tests/validation/reference/YOLOLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef __ARM_COMPUTE_TEST_ARITHMETIC_ADDITION_H__
-#define __ARM_COMPUTE_TEST_ARITHMETIC_ADDITION_H__
+#ifndef __ARM_COMPUTE_TEST_YOLO_LAYER_H__
+#define __ARM_COMPUTE_TEST_YOLO_LAYER_H__
 
 #include "tests/SimpleTensor.h"
 #include "tests/validation/Helpers.h"
@@ -35,14 +35,13 @@
 {
 namespace reference
 {
-template <typename T>
-SimpleTensor<T> arithmetic_addition(const SimpleTensor<T> &src1, const SimpleTensor<T> &src2, SimpleTensor<T> &dst, ConvertPolicy convert_policy);
+template <typename T, typename std::enable_if<is_floating_point<T>::value, int>::type = 0>
+SimpleTensor<T> yolo_layer(const SimpleTensor<T> &src, const ActivationLayerInfo &info, int32_t num_classes);
 
-template <typename T>
-SimpleTensor<T> arithmetic_addition(const SimpleTensor<T> &src1, const SimpleTensor<T> &src2, DataType dst_data_type, ConvertPolicy convert_policy);
-
+template <typename T, typename std::enable_if<std::is_integral<T>::value, int>::type = 0>
+SimpleTensor<T> yolo_layer(const SimpleTensor<T> &src, const ActivationLayerInfo &info, int32_t num_classes);
 } // namespace reference
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
-#endif /* __ARM_COMPUTE_TEST_ARITHMETIC_ADDITION_H__ */
+#endif /* __ARM_COMPUTE_TEST_YOLO_LAYER_H__ */