arm_compute v18.08
diff --git a/tests/validation/reference/AbsoluteDifference.cpp b/tests/validation/reference/AbsoluteDifference.cpp
index f518e67..f9fce5b 100644
--- a/tests/validation/reference/AbsoluteDifference.cpp
+++ b/tests/validation/reference/AbsoluteDifference.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,7 +24,6 @@
 #include "AbsoluteDifference.h"
 
 #include "arm_compute/core/Types.h"
-#include "tests/validation/FixedPoint.h"
 #include "tests/validation/Helpers.h"
 
 namespace arm_compute
diff --git a/tests/validation/reference/Accumulate.cpp b/tests/validation/reference/Accumulate.cpp
index 29a2007..7f34be9 100644
--- a/tests/validation/reference/Accumulate.cpp
+++ b/tests/validation/reference/Accumulate.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,7 +24,6 @@
 #include "Accumulate.h"
 
 #include "arm_compute/core/Types.h"
-#include "tests/validation/FixedPoint.h"
 #include "tests/validation/Helpers.h"
 
 namespace arm_compute
diff --git a/tests/validation/reference/ActivationLayer.cpp b/tests/validation/reference/ActivationLayer.cpp
index df7f653..9455eff 100644
--- a/tests/validation/reference/ActivationLayer.cpp
+++ b/tests/validation/reference/ActivationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,7 +24,6 @@
 #include "ActivationLayer.h"
 
 #include "arm_compute/core/Types.h"
-#include "tests/validation/FixedPoint.h"
 #include "tests/validation/Helpers.h"
 
 namespace arm_compute
@@ -39,7 +38,7 @@
 SimpleTensor<T> activation_layer(const SimpleTensor<T> &src, ActivationLayerInfo info)
 {
     // Create reference
-    SimpleTensor<T> dst{ src.shape(), src.data_type(), 1, src.fixed_point_position() };
+    SimpleTensor<T> dst{ src.shape(), src.data_type(), 1 };
 
     // Compute reference
     const T a(info.a());
@@ -92,68 +91,6 @@
     return dst;
 }
 
-template <typename T, typename std::enable_if<std::is_integral<T>::value, int>::type>
-SimpleTensor<T> activation_layer(const SimpleTensor<T> &src, ActivationLayerInfo info)
-{
-    using namespace fixed_point_arithmetic;
-
-    // Create reference
-    SimpleTensor<T> dst{ src.shape(), src.data_type(), 1, src.fixed_point_position() };
-
-    // Compute reference
-    const int            fixed_point_position = src.fixed_point_position();
-    const fixed_point<T> a(info.a(), fixed_point_position);
-    const fixed_point<T> b(info.b(), fixed_point_position);
-    const fixed_point<T> const_0(0, fixed_point_position);
-    const fixed_point<T> const_1(1, fixed_point_position);
-
-    for(int i = 0; i < src.num_elements(); ++i)
-    {
-        fixed_point<T> x(src[i], fixed_point_position, true);
-
-        switch(info.activation())
-        {
-            case ActivationLayerInfo::ActivationFunction::ABS:
-                dst[i] = abs(x).raw();
-                break;
-            case ActivationLayerInfo::ActivationFunction::LINEAR:
-                dst[i] = add(b, mul(a, x)).raw();
-                break;
-            case ActivationLayerInfo::ActivationFunction::LOGISTIC:
-                dst[i] = (const_1 / (const_1 + exp(-x))).raw();
-                break;
-            case ActivationLayerInfo::ActivationFunction::RELU:
-                dst[i] = max(const_0, x).raw();
-                break;
-            case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
-                dst[i] = min(a, max(const_0, x)).raw();
-                break;
-            case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
-                dst[i] = min(a, max(b, x)).raw();
-                break;
-            case ActivationLayerInfo::ActivationFunction::LEAKY_RELU:
-                dst[i] = (x > const_0) ? x.raw() : mul(a, x).raw();
-                break;
-            case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
-                dst[i] = log(const_1 + exp(x)).raw();
-                break;
-            case ActivationLayerInfo::ActivationFunction::SQRT:
-                dst[i] = (const_1 / inv_sqrt(x)).raw();
-                break;
-            case ActivationLayerInfo::ActivationFunction::SQUARE:
-                dst[i] = mul(x, x).raw();
-                break;
-            case ActivationLayerInfo::ActivationFunction::TANH:
-                dst[i] = mul(a, tanh(mul(b, x))).raw();
-                break;
-            default:
-                ARM_COMPUTE_ERROR("Unsupported activation function");
-        }
-    }
-
-    return dst;
-}
-
 template <>
 SimpleTensor<uint8_t> activation_layer<uint8_t>(const SimpleTensor<uint8_t> &src, ActivationLayerInfo info)
 {
@@ -165,8 +102,6 @@
 
 template SimpleTensor<float> activation_layer(const SimpleTensor<float> &src, ActivationLayerInfo info);
 template SimpleTensor<half> activation_layer(const SimpleTensor<half> &src, ActivationLayerInfo info);
-template SimpleTensor<qint8_t> activation_layer(const SimpleTensor<qint8_t> &src, ActivationLayerInfo info);
-template SimpleTensor<qint16_t> activation_layer(const SimpleTensor<qint16_t> &src, ActivationLayerInfo info);
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/ArithmeticAddition.cpp b/tests/validation/reference/ArithmeticAddition.cpp
index 17020a6..c68c6d4 100644
--- a/tests/validation/reference/ArithmeticAddition.cpp
+++ b/tests/validation/reference/ArithmeticAddition.cpp
@@ -24,7 +24,6 @@
 #include "ArithmeticAddition.h"
 
 #include "arm_compute/core/Types.h"
-#include "tests/validation/FixedPoint.h"
 #include "tests/validation/Helpers.h"
 
 namespace arm_compute
@@ -86,10 +85,8 @@
 } // namespace
 
 template <typename T>
-SimpleTensor<T> arithmetic_addition(const SimpleTensor<T> &src1, const SimpleTensor<T> &src2, DataType dst_data_type, ConvertPolicy convert_policy)
+SimpleTensor<T> arithmetic_addition(const SimpleTensor<T> &src1, const SimpleTensor<T> &src2, SimpleTensor<T> &dst, ConvertPolicy convert_policy)
 {
-    SimpleTensor<T> dst(TensorShape::broadcast_shape(src1.shape(), src2.shape()), dst_data_type);
-
     Coordinates id_src1, id_src2, id_dst;
 
     BroadcastUnroll<Coordinates::num_max_dimensions>::unroll(src1, src2, dst, convert_policy, id_src1, id_src2, id_dst);
@@ -97,11 +94,53 @@
     return dst;
 }
 
-template SimpleTensor<uint8_t> arithmetic_addition(const SimpleTensor<uint8_t> &src1, const SimpleTensor<uint8_t> &src2, DataType dst_data_type, ConvertPolicy convert_policy);
+template <>
+SimpleTensor<uint8_t> arithmetic_addition(const SimpleTensor<uint8_t> &src1, const SimpleTensor<uint8_t> &src2, SimpleTensor<uint8_t> &dst, ConvertPolicy convert_policy)
+{
+    if(dst.data_type() == DataType::QASYMM8)
+    {
+        SimpleTensor<float> src1_tmp = convert_from_asymmetric(src1);
+        SimpleTensor<float> src2_tmp = convert_from_asymmetric(src2);
+        SimpleTensor<float> dst_tmp(TensorShape::broadcast_shape(src1.shape(), src2.shape()), dst.data_type());
+
+        Coordinates id_src1, id_src2, id_dst;
+
+        BroadcastUnroll<Coordinates::num_max_dimensions>::unroll(src1_tmp, src2_tmp, dst_tmp, convert_policy, id_src1, id_src2, id_dst);
+
+        dst = convert_to_asymmetric(dst_tmp, dst.quantization_info());
+        return dst;
+    }
+    else
+    {
+        // DataType::U8
+        Coordinates id_src1, id_src2, id_dst;
+
+        BroadcastUnroll<Coordinates::num_max_dimensions>::unroll(src1, src2, dst, convert_policy, id_src1, id_src2, id_dst);
+
+        return dst;
+    }
+}
+
+template SimpleTensor<int16_t> arithmetic_addition(const SimpleTensor<int16_t> &src1, const SimpleTensor<int16_t> &src2, SimpleTensor<int16_t> &dst, ConvertPolicy convert_policy);
+template SimpleTensor<int8_t> arithmetic_addition(const SimpleTensor<int8_t> &src1, const SimpleTensor<int8_t> &src2, SimpleTensor<int8_t> &dst, ConvertPolicy convert_policy);
+template SimpleTensor<half> arithmetic_addition(const SimpleTensor<half> &src1, const SimpleTensor<half> &src2, SimpleTensor<half> &dst, ConvertPolicy convert_policy);
+template SimpleTensor<float> arithmetic_addition(const SimpleTensor<float> &src1, const SimpleTensor<float> &src2, SimpleTensor<float> &dst, ConvertPolicy convert_policy);
+
+template <typename T>
+SimpleTensor<T> arithmetic_addition(const SimpleTensor<T> &src1, const SimpleTensor<T> &src2, DataType dst_data_type, ConvertPolicy convert_policy)
+{
+    ARM_COMPUTE_ERROR_ON_MSG(dst_data_type == DataType::QASYMM8, "For QASYMM8, the quantized output tensor should be passed directly.");
+
+    SimpleTensor<T> dst(TensorShape::broadcast_shape(src1.shape(), src2.shape()), dst_data_type);
+    arithmetic_addition<T>(src1, src2, dst, convert_policy);
+    return dst;
+}
+
 template SimpleTensor<int16_t> arithmetic_addition(const SimpleTensor<int16_t> &src1, const SimpleTensor<int16_t> &src2, DataType dst_data_type, ConvertPolicy convert_policy);
 template SimpleTensor<int8_t> arithmetic_addition(const SimpleTensor<int8_t> &src1, const SimpleTensor<int8_t> &src2, DataType dst_data_type, ConvertPolicy convert_policy);
 template SimpleTensor<half> arithmetic_addition(const SimpleTensor<half> &src1, const SimpleTensor<half> &src2, DataType dst_data_type, ConvertPolicy convert_policy);
 template SimpleTensor<float> arithmetic_addition(const SimpleTensor<float> &src1, const SimpleTensor<float> &src2, DataType dst_data_type, ConvertPolicy convert_policy);
+
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/ArithmeticAddition.h b/tests/validation/reference/ArithmeticAddition.h
index 5902a6f..faeabd7 100644
--- a/tests/validation/reference/ArithmeticAddition.h
+++ b/tests/validation/reference/ArithmeticAddition.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -36,7 +36,11 @@
 namespace reference
 {
 template <typename T>
+SimpleTensor<T> arithmetic_addition(const SimpleTensor<T> &src1, const SimpleTensor<T> &src2, SimpleTensor<T> &dst, ConvertPolicy convert_policy);
+
+template <typename T>
 SimpleTensor<T> arithmetic_addition(const SimpleTensor<T> &src1, const SimpleTensor<T> &src2, DataType dst_data_type, ConvertPolicy convert_policy);
+
 } // namespace reference
 } // namespace validation
 } // namespace test
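
The overload that takes the destination tensor by reference lets the QASYMM8 specialisation read the output quantization info from the caller's tensor, while the DataType-based overload keeps covering the non-quantized types. A minimal usage sketch against the declarations above, with illustrative shapes and quantization parameters:

    // Sketch only: assumes the arm_compute::test::validation namespaces, as in the fixtures.
    const TensorShape     shape(16U, 16U);
    SimpleTensor<uint8_t> src1{ shape, DataType::QASYMM8, 1, QuantizationInfo(0.5f, 10) };
    SimpleTensor<uint8_t> src2{ shape, DataType::QASYMM8, 1, QuantizationInfo(0.5f, 10) };
    SimpleTensor<uint8_t> dst{ shape, DataType::QASYMM8, 1, QuantizationInfo(0.25f, 5) };
    // ... fill src1 and src2 with quantized input data ...
    reference::arithmetic_addition<uint8_t>(src1, src2, dst, ConvertPolicy::SATURATE);
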
diff --git a/tests/validation/reference/ArithmeticDivision.cpp b/tests/validation/reference/ArithmeticDivision.cpp
new file mode 100644
index 0000000..0102231
--- /dev/null
+++ b/tests/validation/reference/ArithmeticDivision.cpp
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "ArithmeticDivision.h"
+
+#include "arm_compute/core/Types.h"
+#include "tests/validation/Helpers.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+namespace
+{
+template <size_t dim>
+struct BroadcastUnroll
+{
+    template <typename T>
+    static void unroll(const SimpleTensor<T> &src1, const SimpleTensor<T> &src2, SimpleTensor<T> &dst,
+                       Coordinates &id_src1, Coordinates &id_src2, Coordinates &id_dst)
+    {
+        const bool src1_is_broadcast = (src1.shape()[dim - 1] != dst.shape()[dim - 1]);
+        const bool src2_is_broadcast = (src2.shape()[dim - 1] != dst.shape()[dim - 1]);
+
+        id_src1.set(dim - 1, 0);
+        id_src2.set(dim - 1, 0);
+        id_dst.set(dim - 1, 0);
+
+        for(size_t i = 0; i < dst.shape()[dim - 1]; ++i, ++id_dst[dim - 1])
+        {
+            BroadcastUnroll < dim - 1 >::unroll(src1, src2, dst, id_src1, id_src2, id_dst);
+
+            id_src1[dim - 1] += !src1_is_broadcast;
+            id_src2[dim - 1] += !src2_is_broadcast;
+        }
+    }
+};
+
+template <>
+struct BroadcastUnroll<0>
+{
+    template <typename T>
+    static void unroll(const SimpleTensor<T> &src1, const SimpleTensor<T> &src2, SimpleTensor<T> &dst,
+                       Coordinates &id_src1, Coordinates &id_src2, Coordinates &id_dst)
+    {
+        dst[coord2index(dst.shape(), id_dst)] = src1[coord2index(src1.shape(), id_src1)] / src2[coord2index(src2.shape(), id_src2)];
+    }
+};
+} // namespace
+
+template <typename T>
+SimpleTensor<T> arithmetic_division(const SimpleTensor<T> &src1, const SimpleTensor<T> &src2, DataType data_type)
+{
+    SimpleTensor<T> dst(TensorShape::broadcast_shape(src1.shape(), src2.shape()), data_type);
+
+    Coordinates id_src1, id_src2, id_dst;
+
+    BroadcastUnroll<Coordinates::num_max_dimensions>::unroll(src1, src2, dst, id_src1, id_src2, id_dst);
+
+    return dst;
+}
+
+template SimpleTensor<half> arithmetic_division(const SimpleTensor<half> &src1, const SimpleTensor<half> &src2, DataType data_type);
+template SimpleTensor<float> arithmetic_division(const SimpleTensor<float> &src1, const SimpleTensor<float> &src2, DataType data_type);
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
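
BroadcastUnroll recurses from the outermost dimension down to dimension 0 and advances each source index only along dimensions where that source is not broadcast, so an input dimension of size 1 is repeated across the full output extent. A usage sketch with illustrative shapes:

    // Sketch only: assumes the arm_compute::test::validation namespaces.
    SimpleTensor<float> a{ TensorShape(8U, 4U, 2U), DataType::F32 };
    SimpleTensor<float> b{ TensorShape(8U, 1U, 1U), DataType::F32 }; // broadcast along y and z
    // ... fill a and b (b must not contain zeros) ...
    SimpleTensor<float> out = reference::arithmetic_division(a, b, DataType::F32);
    // out.shape() == TensorShape(8U, 4U, 2U)
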
diff --git a/tests/validation/reference/FixedPoint.h b/tests/validation/reference/ArithmeticDivision.h
similarity index 79%
copy from tests/validation/reference/FixedPoint.h
copy to tests/validation/reference/ArithmeticDivision.h
index f0117f9..5459dab 100644
--- a/tests/validation/reference/FixedPoint.h
+++ b/tests/validation/reference/ArithmeticDivision.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,11 +21,11 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef __ARM_COMPUTE_TEST_FIXED_POINT_OPERATION_H__
-#define __ARM_COMPUTE_TEST_FIXED_POINT_OPERATION_H__
+#ifndef __ARM_COMPUTE_TEST_ARITHMETIC_DIVISION_H__
+#define __ARM_COMPUTE_TEST_ARITHMETIC_DIVISION_H__
 
 #include "tests/SimpleTensor.h"
-#include "tests/Types.h"
+#include "tests/validation/Helpers.h"
 
 namespace arm_compute
 {
@@ -36,9 +36,9 @@
 namespace reference
 {
 template <typename T>
-SimpleTensor<T> fixed_point_operation(const SimpleTensor<T> &src, FixedPointOp op);
+SimpleTensor<T> arithmetic_division(const SimpleTensor<T> &src1, const SimpleTensor<T> &src2, DataType data_type);
 } // namespace reference
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
-#endif /* __ARM_COMPUTE_TEST_FIXED_POINT_OPERATION_H__ */
+#endif /* __ARM_COMPUTE_TEST_ARITHMETIC_DIVISION_H__ */
diff --git a/tests/validation/reference/ArithmeticSubtraction.cpp b/tests/validation/reference/ArithmeticSubtraction.cpp
index bed2d37..f39d01f 100644
--- a/tests/validation/reference/ArithmeticSubtraction.cpp
+++ b/tests/validation/reference/ArithmeticSubtraction.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,7 +23,6 @@
  */
 #include "ArithmeticSubtraction.h"
 
-#include "tests/validation/FixedPoint.h"
 #include "tests/validation/Helpers.h"
 
 namespace arm_compute
diff --git a/tests/validation/reference/BatchNormalizationLayer.cpp b/tests/validation/reference/BatchNormalizationLayer.cpp
index c8badac..4ea3769 100644
--- a/tests/validation/reference/BatchNormalizationLayer.cpp
+++ b/tests/validation/reference/BatchNormalizationLayer.cpp
@@ -25,7 +25,6 @@
 
 #include "ActivationLayer.h"
 
-#include "tests/validation/FixedPoint.h"
 #include "tests/validation/Helpers.h"
 
 namespace arm_compute
@@ -36,56 +35,11 @@
 {
 namespace reference
 {
-// Batch Normalization Layer for fixed point type
-template <typename T, typename std::enable_if<std::is_integral<T>::value, int>::type *>
-SimpleTensor<T> batch_normalization_layer(const SimpleTensor<T> &src, const SimpleTensor<T> &mean, const SimpleTensor<T> &var, const SimpleTensor<T> &beta, const SimpleTensor<T> &gamma, float epsilon,
-                                          ActivationLayerInfo act_info, int fixed_point_position)
-{
-    ARM_COMPUTE_UNUSED(act_info);
-    SimpleTensor<T> result(src.shape(), src.data_type());
-
-    const auto cols       = static_cast<int>(src.shape()[0]);
-    const auto rows       = static_cast<int>(src.shape()[1]);
-    const auto depth      = static_cast<int>(src.shape()[2]);
-    const int  upper_dims = src.shape().total_size() / (cols * rows * depth);
-
-    for(int r = 0; r < upper_dims; ++r)
-    {
-        for(int i = 0; i < depth; ++i)
-        {
-            for(int k = 0; k < rows; ++k)
-            {
-                for(int l = 0; l < cols; ++l)
-                {
-                    const int pos = l + k * cols + i * rows * cols + r * cols * rows * depth;
-
-                    fixed_point_arithmetic::fixed_point<T> src_qs(src[pos], fixed_point_position, true);
-                    fixed_point_arithmetic::fixed_point<T> var_qs(var[i], fixed_point_position, true);
-                    fixed_point_arithmetic::fixed_point<T> mean_qs(mean[i], fixed_point_position, true);
-                    fixed_point_arithmetic::fixed_point<T> beta_qs(beta[i], fixed_point_position, true);
-                    fixed_point_arithmetic::fixed_point<T> gamma_qs(gamma[i], fixed_point_position, true);
-                    fixed_point_arithmetic::fixed_point<T> epsilon_qs(epsilon, fixed_point_position);
-
-                    auto denominator = fixed_point_arithmetic::inv_sqrt(var_qs + epsilon_qs);
-                    auto numerator   = src_qs - mean_qs;
-                    auto x_bar       = numerator * denominator;
-                    x_bar            = beta_qs + x_bar * gamma_qs;
-                    result[pos]      = x_bar.raw();
-                }
-            }
-        }
-    }
-
-    return result;
-}
-
 // Batch Normalization Layer for floating point type
 template <typename T, typename std::enable_if<is_floating_point<T>::value, int>::type *>
 SimpleTensor<T> batch_normalization_layer(const SimpleTensor<T> &src, const SimpleTensor<T> &mean, const SimpleTensor<T> &var, const SimpleTensor<T> &beta, const SimpleTensor<T> &gamma, float epsilon,
-                                          ActivationLayerInfo act_info, int fixed_point_position)
+                                          ActivationLayerInfo act_info)
 {
-    ARM_COMPUTE_UNUSED(fixed_point_position);
-
     SimpleTensor<T> result(src.shape(), src.data_type());
 
     const auto cols       = static_cast<int>(src.shape()[0]);
@@ -119,14 +73,10 @@
     return result;
 }
 template SimpleTensor<float> batch_normalization_layer(const SimpleTensor<float> &src, const SimpleTensor<float> &mean, const SimpleTensor<float> &var, const SimpleTensor<float> &beta,
-                                                       const SimpleTensor<float> &gamma, float epsilon, ActivationLayerInfo act_info, int fixed_point_position);
-template SimpleTensor<int8_t> batch_normalization_layer(const SimpleTensor<int8_t> &src, const SimpleTensor<int8_t> &mean, const SimpleTensor<int8_t> &var, const SimpleTensor<int8_t> &beta,
-                                                        const SimpleTensor<int8_t> &gamma, float epsilon, ActivationLayerInfo act_info, int fixed_point_position);
-template SimpleTensor<int16_t> batch_normalization_layer(const SimpleTensor<int16_t> &src, const SimpleTensor<int16_t> &mean, const SimpleTensor<int16_t> &var, const SimpleTensor<int16_t> &beta,
-                                                         const SimpleTensor<int16_t> &gamma, float epsilon, ActivationLayerInfo act_info, int fixed_point_position);
+                                                       const SimpleTensor<float> &gamma, float epsilon, ActivationLayerInfo act_info);
 template SimpleTensor<half> batch_normalization_layer(const SimpleTensor<half> &src, const SimpleTensor<half> &mean, const SimpleTensor<half> &var,
                                                       const SimpleTensor<half> &beta,
-                                                      const SimpleTensor<half> &gamma, float epsilon, ActivationLayerInfo act_info, int fixed_point_position);
+                                                      const SimpleTensor<half> &gamma, float epsilon, ActivationLayerInfo act_info);
 
 } // namespace reference
 } // namespace validation
diff --git a/tests/validation/reference/BatchNormalizationLayer.h b/tests/validation/reference/BatchNormalizationLayer.h
index 329909d..b45d820 100644
--- a/tests/validation/reference/BatchNormalizationLayer.h
+++ b/tests/validation/reference/BatchNormalizationLayer.h
@@ -37,13 +37,11 @@
 {
 template <typename T, typename std::enable_if<std::is_integral<T>::value, int>::type * = nullptr>
 SimpleTensor<T> batch_normalization_layer(const SimpleTensor<T> &src, const SimpleTensor<T> &mean, const SimpleTensor<T> &var, const SimpleTensor<T> &beta, const SimpleTensor<T> &gamma, float epsilon,
-                                          ActivationLayerInfo act_info,
-                                          int                 fixed_point_position);
+                                          ActivationLayerInfo act_info);
 
 template <typename T, typename std::enable_if<is_floating_point<T>::value, int>::type * = nullptr>
 SimpleTensor<T> batch_normalization_layer(const SimpleTensor<T> &src, const SimpleTensor<T> &mean, const SimpleTensor<T> &var, const SimpleTensor<T> &beta, const SimpleTensor<T> &gamma, float epsilon,
-                                          ActivationLayerInfo act_info,
-                                          int                 fixed_point_position);
+                                          ActivationLayerInfo act_info);
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/CannyEdgeDetector.cpp b/tests/validation/reference/CannyEdgeDetector.cpp
new file mode 100644
index 0000000..cfe8ae8
--- /dev/null
+++ b/tests/validation/reference/CannyEdgeDetector.cpp
@@ -0,0 +1,252 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "CannyEdgeDetector.h"
+
+#include "Utils.h"
+#include "support/ToolchainSupport.h"
+#include "tests/validation/Helpers.h"
+#include "tests/validation/reference/Magnitude.h"
+#include "tests/validation/reference/NonMaximaSuppression.h"
+#include "tests/validation/reference/Phase.h"
+#include "tests/validation/reference/Sobel.h"
+
+#include "tests/SimpleTensorPrinter.h"
+
+#include <cmath>
+#include <stack>
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+namespace
+{
+const auto MARK_ZERO  = 0u;
+const auto MARK_MAYBE = 127u;
+const auto MARK_EDGE  = 255u;
+
+template <typename T>
+void trace_edge(SimpleTensor<T> &dst, const ValidRegion &valid_region)
+{
+    std::stack<Coordinates> pixels_stack;
+    for(auto i = 0; i < dst.num_elements(); ++i)
+    {
+        if(dst[i] == MARK_EDGE)
+        {
+            pixels_stack.push(index2coord(dst.shape(), i));
+        }
+    }
+
+    while(!pixels_stack.empty())
+    {
+        const Coordinates pixel_coord = pixels_stack.top();
+        pixels_stack.pop();
+
+        std::array<Coordinates, 8> neighbours =
+        {
+            {
+                Coordinates(pixel_coord.x() - 1, pixel_coord.y() + 0),
+                Coordinates(pixel_coord.x() + 1, pixel_coord.y() + 0),
+                Coordinates(pixel_coord.x() - 1, pixel_coord.y() - 1),
+                Coordinates(pixel_coord.x() + 1, pixel_coord.y() + 1),
+                Coordinates(pixel_coord.x() + 0, pixel_coord.y() - 1),
+                Coordinates(pixel_coord.x() + 0, pixel_coord.y() + 1),
+                Coordinates(pixel_coord.x() + 1, pixel_coord.y() - 1),
+                Coordinates(pixel_coord.x() - 1, pixel_coord.y() + 1)
+            }
+        };
+
+        // Mark MAYBE neighbours as edges since they are next to an EDGE
+        std::for_each(neighbours.begin(), neighbours.end(), [&](Coordinates & coord)
+        {
+            if(is_in_valid_region(valid_region, coord))
+            {
+                const size_t pixel_index = coord2index(dst.shape(), coord);
+                const T      pixel       = dst[pixel_index];
+                if(pixel == MARK_MAYBE)
+                {
+                    dst[pixel_index] = MARK_EDGE;
+                    pixels_stack.push(coord);
+                }
+            }
+        });
+    }
+
+    // Mark all remaining MAYBE pixels as ZERO (not edges)
+    for(auto i = 0; i < dst.num_elements(); ++i)
+    {
+        if(dst[i] == MARK_MAYBE)
+        {
+            dst[i] = MARK_ZERO;
+        }
+    }
+}
+
+template <typename U, typename T>
+SimpleTensor<T> canny_edge_detector_impl(const SimpleTensor<T> &src, int32_t upper, int32_t lower, int gradient_size, MagnitudeType norm_type,
+                                         BorderMode border_mode, T constant_border_value)
+{
+    ARM_COMPUTE_ERROR_ON(gradient_size != 3 && gradient_size != 5 && gradient_size != 7);
+    ARM_COMPUTE_ERROR_ON(lower < 0 || lower >= upper);
+
+    // Output: T == uint8_t
+    SimpleTensor<T> dst{ src.shape(), src.data_type() };
+    ValidRegion     valid_region = shape_to_valid_region(src.shape(), border_mode == BorderMode::UNDEFINED, BorderSize(gradient_size / 2 + 1));
+
+    // Sobel computation: U == int16_t or int32_t
+    SimpleTensor<U> gx, gy;
+    std::tie(gx, gy) = sobel<U>(src, gradient_size, border_mode, constant_border_value, GradientDimension::GRAD_XY);
+
+    using unsigned_U = typename traits::make_unsigned_conditional_t<U>::type;
+    using promoted_U = typename common_promoted_signed_type<U>::intermediate_type;
+
+    // Gradient magnitude and phase (edge direction)
+    const DataType           mag_data_type = gx.data_type() == DataType::S16 ? DataType::U16 : DataType::U32;
+    SimpleTensor<unsigned_U> grad_mag{ gx.shape(), mag_data_type };
+    SimpleTensor<uint8_t>    grad_dir{ gy.shape(), DataType::U8 };
+
+    for(auto i = 0; i < grad_mag.num_elements(); ++i)
+    {
+        double mag = 0.f;
+
+        if(norm_type == MagnitudeType::L2NORM)
+        {
+            mag = support::cpp11::round(std::sqrt(static_cast<promoted_U>(gx[i]) * gx[i] + static_cast<promoted_U>(gy[i]) * gy[i]));
+        }
+        else // MagnitudeType::L1NORM
+        {
+            mag = static_cast<promoted_U>(std::abs(gx[i])) + static_cast<promoted_U>(std::abs(gy[i]));
+        }
+
+        float angle = 180.f * std::atan2(static_cast<float>(gy[i]), static_cast<float>(gx[i])) / M_PI;
+        grad_dir[i] = support::cpp11::round(angle < 0.f ? 180 + angle : angle);
+        grad_mag[i] = saturate_cast<unsigned_U>(mag);
+    }
+
+    /*
+        Quantise the phase into 4 directions
+          0°  dir=0    0.0 <= p <  22.5 or 157.5 <= p < 180
+         45°  dir=1   22.5 <= p <  67.5
+         90°  dir=2   67.5 <= p < 112.5
+        135°  dir=3  112.5 <= p < 157.5
+    */
+    for(auto i = 0; i < grad_dir.num_elements(); ++i)
+    {
+        const auto direction = std::fabs(grad_dir[i]);
+        grad_dir[i]          = (direction < 22.5 || direction >= 157.5) ? 0 : (direction < 67.5) ? 1 : (direction < 112.5) ? 2 : 3;
+    }
+
+    // Non-maximum suppression
+    std::vector<int> strong_edges;
+    const auto       upper_thresh = static_cast<uint32_t>(upper);
+    const auto       lower_thresh = static_cast<uint32_t>(lower);
+
+    const auto pixel_at_offset = [&](const SimpleTensor<unsigned_U> &tensor, const Coordinates & coord, int xoffset, int yoffset)
+    {
+        return tensor_elem_at(tensor, Coordinates{ coord.x() + xoffset, coord.y() + yoffset }, border_mode, static_cast<unsigned_U>(constant_border_value));
+    };
+
+    for(auto i = 0; i < dst.num_elements(); ++i)
+    {
+        const auto coord = index2coord(dst.shape(), i);
+        if(!is_in_valid_region(valid_region, coord) || grad_mag[i] <= lower_thresh)
+        {
+            dst[i] = MARK_ZERO;
+            continue;
+        }
+
+        unsigned_U mag_90, mag90;
+        switch(grad_dir[i])
+        {
+            case 0: // North/South edge direction, compare against East/West pixels (left & right)
+                mag_90 = pixel_at_offset(grad_mag, coord, -1, 0);
+                mag90  = pixel_at_offset(grad_mag, coord, 1, 0);
+                break;
+            case 1: // NE/SW edge direction, compare against NW/SE pixels (top-left & bottom-right)
+                mag_90 = pixel_at_offset(grad_mag, coord, -1, -1);
+                mag90  = pixel_at_offset(grad_mag, coord, +1, +1);
+                break;
+            case 2: // East/West edge direction, compare against North/South pixels (top & bottom)
+                mag_90 = pixel_at_offset(grad_mag, coord, 0, -1);
+                mag90  = pixel_at_offset(grad_mag, coord, 0, +1);
+                break;
+            case 3: // NW/SE edge direction, compare against NE/SW pixels (top-right & bottom-left)
+                mag_90 = pixel_at_offset(grad_mag, coord, +1, -1);
+                mag90  = pixel_at_offset(grad_mag, coord, -1, +1);
+                break;
+            default:
+                ARM_COMPUTE_ERROR("Invalid gradient phase provided");
+                break;
+        }
+
+        // Potential edge if greater than both pixels at +/-90° on either side
+        if(grad_mag[i] > mag_90 && grad_mag[i] > mag90)
+        {
+            // Double thresholding and edge tracing
+            if(grad_mag[i] > upper_thresh)
+            {
+                dst[i] = MARK_EDGE; // Definite edge pixel
+                strong_edges.emplace_back(i);
+            }
+            else
+            {
+                dst[i] = MARK_MAYBE;
+            }
+        }
+        else
+        {
+            dst[i] = MARK_ZERO; // Since not greater than neighbours
+        }
+    }
+
+    // Final edge tracing
+    trace_edge<T>(dst, valid_region);
+    return dst;
+}
+} // namespace
+
+template <typename T>
+SimpleTensor<T> canny_edge_detector(const SimpleTensor<T> &src, int32_t upper_thresh, int32_t lower_thresh, int gradient_size, MagnitudeType norm_type,
+                                    BorderMode border_mode, T constant_border_value)
+{
+    if(gradient_size < 7)
+    {
+        return canny_edge_detector_impl<int16_t>(src, upper_thresh, lower_thresh, gradient_size, norm_type, border_mode, constant_border_value);
+    }
+    else
+    {
+        return canny_edge_detector_impl<int32_t>(src, upper_thresh, lower_thresh, gradient_size, norm_type, border_mode, constant_border_value);
+    }
+}
+
+template SimpleTensor<uint8_t> canny_edge_detector(const SimpleTensor<uint8_t> &src, int32_t upper_thresh, int32_t lower_thresh, int gradient_size, MagnitudeType norm_type,
+                                                   BorderMode border_mode, uint8_t constant_border_value);
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
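
The reference detector quantises the gradient phase into four directions, keeps a pixel only when its magnitude exceeds both neighbours along that direction, classifies it against the two thresholds, and finally promotes MAYBE pixels that connect to a definite edge. A minimal invocation sketch, with illustrative thresholds and image size:

    // Sketch only: assumes the arm_compute::test::validation namespaces.
    SimpleTensor<uint8_t> src{ TensorShape(64U, 64U), DataType::U8 };
    // ... fill src with the input image ...
    SimpleTensor<uint8_t> edges = reference::canny_edge_detector(src,
                                                                 100 /* upper_thresh */, 50 /* lower_thresh */,
                                                                 3 /* gradient_size */, MagnitudeType::L2NORM,
                                                                 BorderMode::UNDEFINED, uint8_t(0));
    // edges holds 255 on detected edge pixels and 0 elsewhere.
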
diff --git a/tests/validation/reference/FixedPointPixelWiseMultiplication.h b/tests/validation/reference/CannyEdgeDetector.h
similarity index 73%
rename from tests/validation/reference/FixedPointPixelWiseMultiplication.h
rename to tests/validation/reference/CannyEdgeDetector.h
index 124a33c..a46c145 100644
--- a/tests/validation/reference/FixedPointPixelWiseMultiplication.h
+++ b/tests/validation/reference/CannyEdgeDetector.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,9 +21,10 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef __ARM_COMPUTE_TEST_FIXED_POINT_PIXEL_WISE_MULTIPLICATION_H__
-#define __ARM_COMPUTE_TEST_FIXED_POINT_PIXEL_WISE_MULTIPLICATION_H__
+#ifndef __ARM_COMPUTE_TEST_CANNY_EDGE_DETECTOR_H__
+#define __ARM_COMPUTE_TEST_CANNY_EDGE_DETECTOR_H__
 
+#include "arm_compute/core/Types.h"
 #include "tests/SimpleTensor.h"
 
 namespace arm_compute
@@ -35,9 +36,10 @@
 namespace reference
 {
 template <typename T>
-SimpleTensor<T> fixed_point_pixel_wise_multiplication(const SimpleTensor<T> &src1, const SimpleTensor<T> &src2, float scale, ConvertPolicy convert_policy);
+SimpleTensor<T> canny_edge_detector(const SimpleTensor<T> &src, int32_t upper_thresh, int32_t lower_thresh, int gradient_size, MagnitudeType norm_type,
+                                    BorderMode border_mode, T constant_border_value = 0);
 } // namespace reference
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
-#endif /* __ARM_COMPUTE_TEST_FIXED_POINT_PIXEL_WISE_MULTIPLICATION_H__ */
+#endif /* __ARM_COMPUTE_TEST_CANNY_EDGE_DETECTOR_H__ */
diff --git a/tests/validation/reference/ChannelCombine.cpp b/tests/validation/reference/ChannelCombine.cpp
index c1ec3ec..b76dcac 100644
--- a/tests/validation/reference/ChannelCombine.cpp
+++ b/tests/validation/reference/ChannelCombine.cpp
@@ -24,7 +24,6 @@
 #include "ChannelCombine.h"
 
 #include "arm_compute/core/Types.h"
-#include "tests/validation/FixedPoint.h"
 #include "tests/validation/Helpers.h"
 
 namespace arm_compute
diff --git a/tests/validation/reference/ChannelExtract.cpp b/tests/validation/reference/ChannelExtract.cpp
index 595bb13..6f17fc0 100644
--- a/tests/validation/reference/ChannelExtract.cpp
+++ b/tests/validation/reference/ChannelExtract.cpp
@@ -24,7 +24,6 @@
 #include "ChannelExtract.h"
 
 #include "arm_compute/core/Types.h"
-#include "tests/validation/FixedPoint.h"
 #include "tests/validation/Helpers.h"
 
 namespace arm_compute
diff --git a/tests/validation/reference/ChannelShuffle.cpp b/tests/validation/reference/ChannelShuffle.cpp
index c4d8d50..b8aa920 100644
--- a/tests/validation/reference/ChannelShuffle.cpp
+++ b/tests/validation/reference/ChannelShuffle.cpp
@@ -39,7 +39,7 @@
 SimpleTensor<T> channel_shuffle(const SimpleTensor<T> &src, int num_groups)
 {
     // Create reference
-    SimpleTensor<T> dst{ src.shape(), src.data_type(), src.num_channels(), src.fixed_point_position(), src.quantization_info() };
+    SimpleTensor<T> dst{ src.shape(), src.data_type(), src.num_channels(), src.quantization_info() };
 
     const int M                 = src.shape()[0];
     const int N                 = src.shape()[1];
diff --git a/tests/validation/reference/Col2Im.cpp b/tests/validation/reference/Col2Im.cpp
new file mode 100644
index 0000000..90e488f
--- /dev/null
+++ b/tests/validation/reference/Col2Im.cpp
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "Col2Im.h"
+
+#include "tests/validation/Helpers.h"
+#include "tests/validation/reference/Utils.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+template <typename T>
+SimpleTensor<T> col2im(const SimpleTensor<T> &src, const TensorShape &dst_shape, unsigned int num_groups)
+{
+    SimpleTensor<T> dst{ dst_shape, src.data_type(), 1 };
+
+    // Compute reference
+    const size_t batches    = dst_shape[3];
+    const size_t src_width  = src.shape().x();
+    const size_t src_height = src.shape().y();
+
+    if(num_groups == 1)
+    {
+        // Batches are on the 3rd dimension of the input tensor
+        int dst_idx = 0;
+        for(size_t b = 0; b < batches; ++b)
+        {
+            for(size_t x = 0; x < src_width; ++x)
+            {
+                for(size_t y = 0; y < src_height; ++y)
+                {
+                    dst[dst_idx++] = src[coord2index(src.shape(), Coordinates(x, y, b))];
+                }
+            }
+        }
+    }
+    else
+    {
+        int dst_idx = 0;
+        for(size_t b = 0; b < batches; ++b)
+        {
+            for(size_t g = 0; g < num_groups; ++g)
+            {
+                for(size_t x = 0; x < src_width; ++x)
+                {
+                    for(size_t y = 0; y < src_height; ++y)
+                    {
+                        dst[dst_idx++] = src[coord2index(src.shape(), Coordinates(x, y, g, b))];
+                    }
+                }
+            }
+        }
+    }
+    return dst;
+}
+
+template SimpleTensor<float> col2im(const SimpleTensor<float> &src, const TensorShape &dst_shape, unsigned int num_groups);
+template SimpleTensor<half> col2im(const SimpleTensor<half> &src, const TensorShape &dst_shape, unsigned int num_groups);
+template SimpleTensor<uint8_t> col2im(const SimpleTensor<uint8_t> &src, const TensorShape &dst_shape, unsigned int num_groups);
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
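
col2im walks the column tensor in batch, group, column, row order and writes elements contiguously into the destination, so dst_shape.total_size() has to match batches * num_groups * src_width * src_height. A usage sketch with illustrative shapes:

    // Sketch only: assumes the arm_compute::test::validation namespaces.
    SimpleTensor<float> cols{ TensorShape(16U, 9U, 2U), DataType::F32 }; // (W=16, H=9, batches=2)
    const TensorShape   dst_shape(3U, 3U, 16U, 2U);                      // 3*3*16*2 == 16*9*2
    // ... fill cols ...
    SimpleTensor<float> out = reference::col2im(cols, dst_shape, 1 /* num_groups */);
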
diff --git a/tests/validation/reference/FixedPoint.h b/tests/validation/reference/Col2Im.h
similarity index 81%
rename from tests/validation/reference/FixedPoint.h
rename to tests/validation/reference/Col2Im.h
index f0117f9..6082610 100644
--- a/tests/validation/reference/FixedPoint.h
+++ b/tests/validation/reference/Col2Im.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,11 +21,11 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef __ARM_COMPUTE_TEST_FIXED_POINT_OPERATION_H__
-#define __ARM_COMPUTE_TEST_FIXED_POINT_OPERATION_H__
+#ifndef __ARM_COMPUTE_TEST_COL2IM_H__
+#define __ARM_COMPUTE_TEST_COL2IM_H__
 
 #include "tests/SimpleTensor.h"
-#include "tests/Types.h"
+#include "tests/validation/Helpers.h"
 
 namespace arm_compute
 {
@@ -36,9 +36,9 @@
 namespace reference
 {
 template <typename T>
-SimpleTensor<T> fixed_point_operation(const SimpleTensor<T> &src, FixedPointOp op);
+SimpleTensor<T> col2im(const SimpleTensor<T> &src, const TensorShape &dst_shape, unsigned int num_groups);
 } // namespace reference
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
-#endif /* __ARM_COMPUTE_TEST_FIXED_POINT_OPERATION_H__ */
+#endif /* __ARM_COMPUTE_TEST_COL2IM_H__ */
diff --git a/tests/validation/reference/ColorConvert.cpp b/tests/validation/reference/ColorConvert.cpp
new file mode 100644
index 0000000..8047b34
--- /dev/null
+++ b/tests/validation/reference/ColorConvert.cpp
@@ -0,0 +1,209 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "ColorConvert.h"
+
+#include "arm_compute/core/Types.h"
+#include "tests/validation/Helpers.h"
+#include "tests/validation/reference/ColorConvertHelper.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+namespace
+{
+template <typename T>
+inline std::vector<SimpleTensor<T>> create_image_planes(const TensorShape &shape, Format format)
+{
+    TensorShape image_shape = adjust_odd_shape(shape, format);
+
+    std::vector<SimpleTensor<T>> image_planes;
+
+    switch(format)
+    {
+        case Format::RGB888:
+        case Format::RGBA8888:
+        case Format::YUYV422:
+        case Format::UYVY422:
+        {
+            image_planes.emplace_back(image_shape, format);
+            break;
+        }
+        case Format::NV12:
+        case Format::NV21:
+        {
+            TensorShape shape_uv88 = calculate_subsampled_shape(image_shape, Format::UV88);
+
+            image_planes.emplace_back(image_shape, Format::U8);
+            image_planes.emplace_back(shape_uv88, Format::UV88);
+            break;
+        }
+        case Format::IYUV:
+        {
+            TensorShape shape_sub2 = calculate_subsampled_shape(image_shape, Format::IYUV);
+
+            image_planes.emplace_back(image_shape, Format::U8);
+            image_planes.emplace_back(shape_sub2, Format::U8);
+            image_planes.emplace_back(shape_sub2, Format::U8);
+            break;
+        }
+        case Format::YUV444:
+        {
+            image_planes.emplace_back(image_shape, Format::U8);
+            image_planes.emplace_back(image_shape, Format::U8);
+            image_planes.emplace_back(image_shape, Format::U8);
+            break;
+        }
+        default:
+            ARM_COMPUTE_ERROR("Not supported");
+            break;
+    }
+
+    return image_planes;
+}
+} // namespace
+
+template <typename T>
+std::vector<SimpleTensor<T>> color_convert(const TensorShape &shape, const std::vector<SimpleTensor<T>> &tensor_planes, Format src_format, Format dst_format)
+{
+    std::vector<SimpleTensor<T>> dst = create_image_planes<T>(shape, dst_format);
+
+    switch(src_format)
+    {
+        case Format::RGB888:
+        {
+            switch(dst_format)
+            {
+                case Format::RGBA8888:
+                    colorconvert_helper::detail::colorconvert_rgb_to_rgbx(tensor_planes[0], dst[0]);
+                    break;
+                case Format::NV12:
+                    colorconvert_helper::detail::colorconvert_rgb_to_nv12(tensor_planes[0], dst);
+                    break;
+                case Format::IYUV:
+                    colorconvert_helper::detail::colorconvert_rgb_to_iyuv(tensor_planes[0], dst);
+                    break;
+                case Format::YUV444:
+                    colorconvert_helper::detail::colorconvert_rgb_to_yuv4(tensor_planes[0], dst);
+                    break;
+                default:
+                    ARM_COMPUTE_ERROR("Not Supported");
+                    break;
+            }
+            break;
+        }
+        case Format::RGBA8888:
+        {
+            switch(dst_format)
+            {
+                case Format::RGB888:
+                    colorconvert_helper::detail::colorconvert_rgbx_to_rgb(tensor_planes[0], dst[0]);
+                    break;
+                case Format::NV12:
+                    colorconvert_helper::detail::colorconvert_rgb_to_nv12(tensor_planes[0], dst);
+                    break;
+                case Format::IYUV:
+                    colorconvert_helper::detail::colorconvert_rgb_to_iyuv(tensor_planes[0], dst);
+                    break;
+                case Format::YUV444:
+                    colorconvert_helper::detail::colorconvert_rgb_to_yuv4(tensor_planes[0], dst);
+                    break;
+                default:
+                    ARM_COMPUTE_ERROR("Not Supported");
+                    break;
+            }
+            break;
+        }
+        case Format::UYVY422:
+        case Format::YUYV422:
+        {
+            switch(dst_format)
+            {
+                case Format::RGB888:
+                case Format::RGBA8888:
+                    colorconvert_helper::detail::colorconvert_yuyv_to_rgb(tensor_planes[0], src_format, dst[0]);
+                    break;
+                case Format::NV12:
+                    colorconvert_helper::detail::colorconvert_yuyv_to_nv12(tensor_planes[0], src_format, dst);
+                    break;
+                case Format::IYUV:
+                    colorconvert_helper::detail::colorconvert_yuyv_to_iyuv(tensor_planes[0], src_format, dst);
+                    break;
+                default:
+                    ARM_COMPUTE_ERROR("Not Supported");
+                    break;
+            }
+            break;
+        }
+        case Format::IYUV:
+        {
+            switch(dst_format)
+            {
+                case Format::RGB888:
+                case Format::RGBA8888:
+                    colorconvert_helper::detail::colorconvert_iyuv_to_rgb(shape, tensor_planes, dst[0]);
+                    break;
+                default:
+                    ARM_COMPUTE_ERROR("Not Supported");
+                    break;
+            }
+            break;
+        }
+        case Format::NV12:
+        case Format::NV21:
+        {
+            switch(dst_format)
+            {
+                case Format::RGB888:
+                case Format::RGBA8888:
+                    colorconvert_helper::detail::colorconvert_nv12_to_rgb(shape, src_format, tensor_planes, dst[0]);
+                    break;
+                case Format::IYUV:
+                    colorconvert_helper::detail::colorconvert_nv_to_iyuv(tensor_planes, src_format, dst);
+                    break;
+                case Format::YUV444:
+                    colorconvert_helper::detail::colorconvert_nv_to_yuv4(tensor_planes, src_format, dst);
+                    break;
+                default:
+                    ARM_COMPUTE_ERROR("Not Supported");
+                    break;
+            }
+            break;
+        }
+        default:
+            ARM_COMPUTE_ERROR("Not supported");
+            break;
+    }
+    return dst;
+}
+
+template std::vector<SimpleTensor<uint8_t>> color_convert(const TensorShape &shape, const std::vector<SimpleTensor<uint8_t>> &tensor_planes, Format src_format, Format dst_format);
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
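
color_convert returns one SimpleTensor per plane of the destination format, allocating the planes via create_image_planes before dispatching to the per-format helpers. A sketch for a single-plane RGB888 source converted to the two-plane NV12 layout, with an illustrative shape:

    // Sketch only: assumes the arm_compute::test::validation namespaces.
    const TensorShape shape(64U, 32U);
    std::vector<SimpleTensor<uint8_t>> src;
    src.emplace_back(shape, Format::RGB888);
    // ... fill src[0] with interleaved RGB data ...
    std::vector<SimpleTensor<uint8_t>> dst = reference::color_convert(shape, src, Format::RGB888, Format::NV12);
    // dst[0] is the full-resolution Y plane (U8); dst[1] is the subsampled interleaved UV88 plane.
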
diff --git a/tests/validation/reference/FixedPoint.h b/tests/validation/reference/ColorConvert.h
similarity index 79%
copy from tests/validation/reference/FixedPoint.h
copy to tests/validation/reference/ColorConvert.h
index f0117f9..a49bbba 100644
--- a/tests/validation/reference/FixedPoint.h
+++ b/tests/validation/reference/ColorConvert.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,11 +21,10 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef __ARM_COMPUTE_TEST_FIXED_POINT_OPERATION_H__
-#define __ARM_COMPUTE_TEST_FIXED_POINT_OPERATION_H__
+#ifndef __ARM_COMPUTE_TEST_COLOR_CONVERT_H__
+#define __ARM_COMPUTE_TEST_COLOR_CONVERT_H__
 
 #include "tests/SimpleTensor.h"
-#include "tests/Types.h"
 
 namespace arm_compute
 {
@@ -36,9 +35,9 @@
 namespace reference
 {
 template <typename T>
-SimpleTensor<T> fixed_point_operation(const SimpleTensor<T> &src, FixedPointOp op);
+std::vector<SimpleTensor<T>> color_convert(const TensorShape &shape, const std::vector<SimpleTensor<T>> &tensor_planes, Format src_format, Format dst_format);
 } // namespace reference
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
-#endif /* __ARM_COMPUTE_TEST_FIXED_POINT_OPERATION_H__ */
+#endif /* __ARM_COMPUTE_TEST_COLOR_CONVERT_H__ */
diff --git a/tests/validation/reference/ColorConvertHelper.h b/tests/validation/reference/ColorConvertHelper.h
new file mode 100644
index 0000000..7a8b547
--- /dev/null
+++ b/tests/validation/reference/ColorConvertHelper.h
@@ -0,0 +1,851 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_VALIDATION_COLOR_CONVERT_H__
+#define __ARM_COMPUTE_TEST_VALIDATION_COLOR_CONVERT_H__
+
+#include "Utils.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace colorconvert_helper
+{
+namespace detail
+{
+constexpr float red_coef_bt709    = 1.5748F;
+constexpr float green_coef_bt709  = -0.1873f;
+constexpr float green_coef2_bt709 = -0.4681f;
+constexpr float blue_coef_bt709   = 1.8556f;
+
+constexpr float rgb2yuv_bt709_kr = 0.2126f;
+constexpr float rgb2yuv_bt709_kb = 0.0722f;
+// K_g = 1 - K_r - K_b
+constexpr float rgb2yuv_bt709_kg = 0.7152f;
+// C_u = 1 / (2 * (1 - K_b))
+constexpr float rgb2yuv_bt709_cu = 0.5389f;
+// C_v = 1 / (2 * (1 - K_r))
+constexpr float rgb2yuv_bt709_cv = 0.6350f;
+
+template <typename T>
+inline void store_rgb_from_src(const SimpleTensor<T> src, SimpleTensor<T> &rvec, SimpleTensor<T> &gvec, SimpleTensor<T> &bvec)
+{
+    int width  = src.shape().x();
+    int height = src.shape().y();
+
+    for(int y = 0; y < height; ++y)
+    {
+        for(int x = 0; x < width; ++x)
+        {
+            const Coordinates src_coord{ x, y };
+            const Coordinates vec_coord{ x, y };
+
+            const auto *src_pixel  = reinterpret_cast<const T *>(src(src_coord));
+            auto       *rvec_pixel = reinterpret_cast<T *>(rvec(vec_coord));
+            auto       *gvec_pixel = reinterpret_cast<T *>(gvec(vec_coord));
+            auto       *bvec_pixel = reinterpret_cast<T *>(bvec(vec_coord));
+
+            rvec_pixel[0] = src_pixel[0];
+            gvec_pixel[0] = src_pixel[1];
+            bvec_pixel[0] = src_pixel[2];
+        }
+    }
+}
+
+template <typename T>
+inline void rgb_to_yuv_calculation(const SimpleTensor<T> rvec, const SimpleTensor<T> gvec, const SimpleTensor<T> bvec, SimpleTensor<T> &yvec, SimpleTensor<T> &uvec_top, SimpleTensor<T> &uvec_bottom,
+                                   SimpleTensor<T> &vvec_top, SimpleTensor<T> &vvec_bottom)
+{
+    int width  = rvec.shape().x();
+    int height = rvec.shape().y();
+
+    int         uvec_coord_x = 0;
+    int         uvec_coord_y = 0;
+    Coordinates uvec_coord{ uvec_coord_x, uvec_coord_y };
+
+    for(int y = 0; y < height; ++y)
+    {
+        for(int x = 0; x < width; x += 2)
+        {
+            Coordinates coord{ x, y };
+            auto       *yvec_pixel        = reinterpret_cast<T *>(yvec(coord));
+            auto       *uvec_top_pixel    = reinterpret_cast<T *>(uvec_top(uvec_coord));
+            auto       *uvec_bottom_pixel = reinterpret_cast<T *>(uvec_bottom(uvec_coord));
+            auto       *vvec_top_pixel    = reinterpret_cast<T *>(vvec_top(uvec_coord));
+            auto       *vvec_bottom_pixel = reinterpret_cast<T *>(vvec_bottom(uvec_coord));
+
+            T     border_value(0);
+            int   rvec_val = validation::tensor_elem_at(rvec, coord, BorderMode::CONSTANT, border_value);
+            int   gvec_val = validation::tensor_elem_at(gvec, coord, BorderMode::CONSTANT, border_value);
+            int   bvec_val = validation::tensor_elem_at(bvec, coord, BorderMode::CONSTANT, border_value);
+            float result   = rvec_val * rgb2yuv_bt709_kr + gvec_val * rgb2yuv_bt709_kg + bvec_val * rgb2yuv_bt709_kb;
+
+            yvec_pixel[0]     = result;
+            uvec_top_pixel[0] = (bvec_val - result) * rgb2yuv_bt709_cu + 128.f;
+            vvec_top_pixel[0] = (rvec_val - result) * rgb2yuv_bt709_cv + 128.f;
+
+            coord.set(0, x + 1);
+            rvec_val = validation::tensor_elem_at(rvec, coord, BorderMode::CONSTANT, border_value);
+            gvec_val = validation::tensor_elem_at(gvec, coord, BorderMode::CONSTANT, border_value);
+            bvec_val = validation::tensor_elem_at(bvec, coord, BorderMode::CONSTANT, border_value);
+            result   = rvec_val * rgb2yuv_bt709_kr + gvec_val * rgb2yuv_bt709_kg + bvec_val * rgb2yuv_bt709_kb;
+
+            yvec_pixel[1]        = result;
+            uvec_bottom_pixel[0] = (bvec_val - result) * rgb2yuv_bt709_cu + 128.f;
+            vvec_bottom_pixel[0] = (rvec_val - result) * rgb2yuv_bt709_cv + 128.f;
+
+            uvec_coord.set(0, ++uvec_coord_x);
+        }
+    }
+}
+inline float compute_rgb_value(int y_value, int v_value, int u_value, unsigned char channel_idx)
+{
+    float result = 0.f;
+    switch(channel_idx)
+    {
+        case 0:
+        {
+            const float red = (v_value - 128.f) * red_coef_bt709;
+            result          = y_value + red;
+            break;
+        }
+        case 1:
+        {
+            const float green = (u_value - 128.f) * green_coef_bt709 + (v_value - 128.f) * green_coef2_bt709;
+            result            = y_value + green;
+            break;
+        }
+        case 2:
+        {
+            const float blue = (u_value - 128.f) * blue_coef_bt709;
+            result           = y_value + blue;
+            break;
+        }
+        default:
+        {
+            //Assuming Alpha channel
+            return 255;
+        }
+    }
+    return std::min(std::max(0.f, result), 255.f);
+}
+
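+// Reconstructs the interleaved RGB(A) destination from the de-interleaved Y/YY and U/V vectors:
+// each chroma sample is shared by two horizontally adjacent output pixels.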
+template <typename T>
+inline void yuyv_to_rgb_calculation(const SimpleTensor<T> yvec, const SimpleTensor<T> vvec, const SimpleTensor<T> yyvec, const SimpleTensor<T> uvec, SimpleTensor<T> &dst)
+{
+    const int dst_width  = dst.shape().x();
+    const int dst_height = dst.shape().y();
+    for(int y = 0; y < dst_height; ++y)
+    {
+        int x_coord = 0;
+        for(int x = 0; x < dst_width; x += 2, ++x_coord)
+        {
+            const Coordinates dst_coord{ x, y };
+            auto             *dst_pixel = reinterpret_cast<T *>(dst(dst_coord));
+            const T           border_value(0);
+            const int         yvec_val  = validation::tensor_elem_at(yvec, { x_coord, y }, BorderMode::CONSTANT, border_value);
+            const int         vvec_val  = validation::tensor_elem_at(vvec, { x_coord, y }, BorderMode::CONSTANT, border_value);
+            const int         yyvec_val = validation::tensor_elem_at(yyvec, { x_coord, y }, BorderMode::CONSTANT, border_value);
+            const int         uvec_val  = validation::tensor_elem_at(uvec, { x_coord, y }, BorderMode::CONSTANT, border_value);
+            //Compute first RGB value using Y plane
+            for(int channel_idx = 0; channel_idx < dst.num_channels(); ++channel_idx)
+            {
+                const float channel_value = compute_rgb_value(yvec_val, vvec_val, uvec_val, channel_idx);
+                dst_pixel[channel_idx]    = channel_value;
+            }
+            //Compute second RGB value using YY plane
+            const Coordinates dst_coord2
+            {
+                x + 1, y
+            };
+            dst_pixel = reinterpret_cast<T *>(dst(dst_coord2));
+            for(int channel_idx = 0; channel_idx < dst.num_channels(); ++channel_idx)
+            {
+                const float channel_value = compute_rgb_value(yyvec_val, vvec_val, uvec_val, channel_idx);
+                dst_pixel[channel_idx]    = channel_value;
+            }
+        }
+    }
+}
+
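+// Copies the R, G and B channels to the destination and sets the alpha channel (index 3) to 255.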
+template <typename T>
+inline void colorconvert_rgb_to_rgbx(const SimpleTensor<T> src, SimpleTensor<T> &dst)
+{
+    for(int channel_idx = 0; channel_idx < dst.num_channels(); ++channel_idx)
+    {
+        const int width  = dst.shape().x();
+        const int height = dst.shape().y();
+
+        for(int y = 0; y < height; ++y)
+        {
+            for(int x = 0; x < width; ++x)
+            {
+                const Coordinates src_coord{ x, y };
+                const Coordinates dst_coord{ x, y };
+
+                const auto *src_pixel = reinterpret_cast<const T *>(src(src_coord));
+                auto       *dst_pixel = reinterpret_cast<T *>(dst(dst_coord));
+                if(channel_idx == 3)
+                {
+                    dst_pixel[channel_idx] = 255;
+                    continue;
+                }
+
+                dst_pixel[channel_idx] = src_pixel[channel_idx];
+            }
+        }
+    }
+}
+
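+// Copies only the destination's R, G and B channels from the RGBX source, dropping alpha.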
+template <typename T>
+inline void colorconvert_rgbx_to_rgb(const SimpleTensor<T> src, SimpleTensor<T> &dst)
+{
+    for(int channel_idx = 0; channel_idx < dst.num_channels(); ++channel_idx)
+    {
+        const int width  = dst.shape().x();
+        const int height = dst.shape().y();
+
+        for(int y = 0; y < height; ++y)
+        {
+            for(int x = 0; x < width; ++x)
+            {
+                const Coordinates src_coord{ x, y };
+                const Coordinates dst_coord{ x, y };
+
+                const auto *src_pixel = reinterpret_cast<const T *>(src(src_coord));
+                auto       *dst_pixel = reinterpret_cast<T *>(dst(dst_coord));
+
+                dst_pixel[channel_idx] = src_pixel[channel_idx];
+            }
+        }
+    }
+}
+
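+// Splits a packed YUYV422/UYVY422 source into Y, U, YY and V vectors (the offset selects the byte
+// order) and then converts them to RGB.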
+template <typename T>
+inline void colorconvert_yuyv_to_rgb(const SimpleTensor<T> src, const Format format, SimpleTensor<T> &dst)
+{
+    SimpleTensor<T> yvec(TensorShape{ src.shape().x() / 2, src.shape().y() }, Format::U8);
+    SimpleTensor<T> uvec(TensorShape{ src.shape().x() / 2, src.shape().y() }, Format::U8);
+    SimpleTensor<T> yyvec(TensorShape{ src.shape().x() / 2, src.shape().y() }, Format::U8);
+    SimpleTensor<T> vvec(TensorShape{ src.shape().x() / 2, src.shape().y() }, Format::U8);
+
+    const int step_x = (Format::YUYV422 == format || Format::UYVY422 == format) ? 2 : 1;
+    const int offset = (Format::YUYV422 == format) ? 0 : 1;
+
+    Coordinates elem_coord{ 0, 0 };
+    const int   width  = yvec.shape().x();
+    const int   height = yvec.shape().y();
+
+    for(int y = 0; y < height; ++y)
+    {
+        for(int x = 0; x < width; ++x)
+        {
+            const Coordinates src_coord{ x * step_x, y };
+            const auto       *src_pixel   = reinterpret_cast<const T *>(src(src_coord));
+            auto             *yvec_pixel  = reinterpret_cast<T *>(yvec(elem_coord));
+            auto             *uvec_pixel  = reinterpret_cast<T *>(uvec(elem_coord));
+            auto             *yyvec_pixel = reinterpret_cast<T *>(yyvec(elem_coord));
+            auto             *vvec_pixel  = reinterpret_cast<T *>(vvec(elem_coord));
+            yvec_pixel[x]                 = src_pixel[0 + offset];
+            uvec_pixel[x]                 = src_pixel[1 - offset];
+            yyvec_pixel[x]                = src_pixel[2 + offset];
+            vvec_pixel[x]                 = src_pixel[3 - offset];
+        }
+        elem_coord.set(1, y + 1);
+    }
+
+    yuyv_to_rgb_calculation(yvec, vvec, yyvec, uvec, dst);
+}
+
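+// Splits the IYUV luma plane into even/odd column Y/YY vectors, replicates each half-resolution
+// U/V sample over two rows, and then converts to RGB.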
+template <typename T>
+inline void colorconvert_iyuv_to_rgb(const TensorShape &shape, const std::vector<SimpleTensor<T>> &tensor_planes, SimpleTensor<T> &dst)
+{
+    SimpleTensor<T> yvec(TensorShape{ tensor_planes[0].shape().x() / 2, tensor_planes[0].shape().y() }, Format::U8);
+    SimpleTensor<T> uvec(TensorShape{ tensor_planes[0].shape().x() / 2, tensor_planes[0].shape().y() }, Format::U8);
+    SimpleTensor<T> yyvec(TensorShape{ tensor_planes[0].shape().x() / 2, tensor_planes[0].shape().y() }, Format::U8);
+    SimpleTensor<T> vvec(TensorShape{ tensor_planes[0].shape().x() / 2, tensor_planes[0].shape().y() }, Format::U8);
+
+    Coordinates elem_coord{ 0, 0 };
+    const int   yvec_width  = yvec.shape().x();
+    const int   yvec_height = yvec.shape().y();
+
+    for(int y = 0; y < yvec_height; ++y)
+    {
+        for(int x = 0; x < yvec_width; ++x)
+        {
+            const Coordinates src_coord{ x, y };
+            const auto       *src_pixel   = reinterpret_cast<const T *>(tensor_planes[0](src_coord));
+            auto             *yvec_pixel  = reinterpret_cast<T *>(yvec(elem_coord));
+            auto             *yyvec_pixel = reinterpret_cast<T *>(yyvec(elem_coord));
+            yvec_pixel[x]                 = src_pixel[x];
+            yyvec_pixel[x]                = src_pixel[x + 1];
+        }
+        elem_coord.set(1, y + 1);
+    }
+
+    const int uvec_width  = uvec.shape().x();
+    const int uvec_height = uvec.shape().y();
+
+    Coordinates top_elem_coord{ 0, 0 };
+    Coordinates bottom_elem_coord{ 0, 1 };
+    for(int y = 0; y < uvec_height; y += 2)
+    {
+        for(int x = 0; x < uvec_width; ++x)
+        {
+            const Coordinates src_coord{ x, y / 2 };
+            const auto       *u_pixel        = reinterpret_cast<const T *>(tensor_planes[1](src_coord));
+            const auto       *v_pixel        = reinterpret_cast<const T *>(tensor_planes[2](src_coord));
+            auto             *uvec_pixel_top = reinterpret_cast<T *>(uvec(top_elem_coord));
+            auto             *vvec_pixel_top = reinterpret_cast<T *>(vvec(top_elem_coord));
+
+            auto *uvec_pixel_bottom = reinterpret_cast<T *>(uvec(bottom_elem_coord));
+            auto *vvec_pixel_bottom = reinterpret_cast<T *>(vvec(bottom_elem_coord));
+            uvec_pixel_top[x]       = u_pixel[0];
+            vvec_pixel_top[x]       = v_pixel[0];
+            uvec_pixel_bottom[x]    = u_pixel[0];
+            vvec_pixel_bottom[x]    = v_pixel[0];
+        }
+        top_elem_coord.set(1, y + 2);
+        bottom_elem_coord.set(1, top_elem_coord.y() + 1);
+    }
+
+    yuyv_to_rgb_calculation(yvec, vvec, yyvec, uvec, dst);
+}
+
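+// Splits the NV12/NV21 luma plane into even/odd column Y/YY vectors, de-interleaves the
+// half-resolution UV plane (the offset selects the U/V ordering) while replicating each chroma
+// sample over two rows, and then converts to RGB.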
+template <typename T>
+inline void colorconvert_nv12_to_rgb(const TensorShape &shape, const Format format, const std::vector<SimpleTensor<T>> &tensor_planes, SimpleTensor<T> &dst)
+{
+    SimpleTensor<T> yvec(TensorShape{ tensor_planes[0].shape().x() / 2, tensor_planes[0].shape().y() }, Format::U8);
+    SimpleTensor<T> uvec(TensorShape{ tensor_planes[0].shape().x() / 2, tensor_planes[0].shape().y() }, Format::U8);
+    SimpleTensor<T> yyvec(TensorShape{ tensor_planes[0].shape().x() / 2, tensor_planes[0].shape().y() }, Format::U8);
+    SimpleTensor<T> vvec(TensorShape{ tensor_planes[0].shape().x() / 2, tensor_planes[0].shape().y() }, Format::U8);
+
+    const int offset = (Format::NV12 == format) ? 0 : 1;
+
+    Coordinates elem_coord{ 0, 0 };
+    const int   yvec_width  = yvec.shape().x();
+    const int   yvec_height = yvec.shape().y();
+
+    for(int y = 0; y < yvec_height; ++y)
+    {
+        for(int x = 0; x < yvec_width; ++x)
+        {
+            const Coordinates src_coord{ x, y };
+            const auto       *src_pixel   = reinterpret_cast<const T *>(tensor_planes[0](src_coord));
+            auto             *yvec_pixel  = reinterpret_cast<T *>(yvec(elem_coord));
+            auto             *yyvec_pixel = reinterpret_cast<T *>(yyvec(elem_coord));
+            yvec_pixel[x]                 = src_pixel[x];
+            yyvec_pixel[x]                = src_pixel[x + 1];
+        }
+        elem_coord.set(1, y + 1);
+    }
+
+    const int uvec_width  = uvec.shape().x();
+    const int uvec_height = uvec.shape().y();
+
+    Coordinates top_elem_coord{ 0, 0 };
+    Coordinates bottom_elem_coord{ 0, 1 };
+    for(int y = 0; y < uvec_height; y += 2)
+    {
+        for(int x = 0; x < uvec_width; ++x)
+        {
+            const Coordinates src_coord{ x, y / 2 };
+            const auto       *src_pixel      = reinterpret_cast<const T *>(tensor_planes[1](src_coord));
+            auto             *uvec_pixel_top = reinterpret_cast<T *>(uvec(top_elem_coord));
+            auto             *vvec_pixel_top = reinterpret_cast<T *>(vvec(top_elem_coord));
+
+            auto *uvec_pixel_bottom = reinterpret_cast<T *>(uvec(bottom_elem_coord));
+            auto *vvec_pixel_bottom = reinterpret_cast<T *>(vvec(bottom_elem_coord));
+            uvec_pixel_top[x]       = src_pixel[0 + offset];
+            vvec_pixel_top[x]       = src_pixel[1 - offset];
+            uvec_pixel_bottom[x]    = src_pixel[0 + offset];
+            vvec_pixel_bottom[x]    = src_pixel[1 - offset];
+        }
+        top_elem_coord.set(1, y + 2);
+        bottom_elem_coord.set(1, top_elem_coord.y() + 1);
+    }
+
+    yuyv_to_rgb_calculation(yvec, vvec, yyvec, uvec, dst);
+}
+
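+// RGB to NV12: computes the full-resolution Y plane, then averages the chroma of each 2x2 pixel
+// block and stores it interleaved in the half-resolution UV plane.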
+template <typename T>
+inline void colorconvert_rgb_to_nv12(const SimpleTensor<T> src, std::vector<SimpleTensor<T>> &dst)
+{
+    SimpleTensor<T> rvec(TensorShape{ dst[0].shape().x(), dst[0].shape().y() }, Format::U8);
+    SimpleTensor<T> gvec(TensorShape{ dst[0].shape().x(), dst[0].shape().y() }, Format::U8);
+    SimpleTensor<T> bvec(TensorShape{ dst[0].shape().x(), dst[0].shape().y() }, Format::U8);
+    SimpleTensor<T> yvec(TensorShape{ dst[0].shape().x(), dst[0].shape().y() }, Format::U8);
+
+    int vec_shape_x = src.shape().x() * src.shape().y();
+
+    SimpleTensor<T> uvec_top(TensorShape{ vec_shape_x, 1U }, Format::U8);
+    SimpleTensor<T> uvec_bottom(TensorShape{ vec_shape_x, 1U }, Format::U8);
+    SimpleTensor<T> vvec_top(TensorShape{ vec_shape_x, 1U }, Format::U8);
+    SimpleTensor<T> vvec_bottom(TensorShape{ vec_shape_x, 1U }, Format::U8);
+
+    store_rgb_from_src(src, rvec, gvec, bvec);
+    rgb_to_yuv_calculation(rvec, gvec, bvec, dst[0], uvec_top, uvec_bottom, vvec_top, vvec_bottom);
+
+    SimpleTensor<T> utmp(TensorShape{ src.shape().x() / 2, src.shape().y() }, Format::U8);
+    SimpleTensor<T> vtmp(TensorShape{ src.shape().x() / 2, src.shape().y() }, Format::U8);
+
+    int utmp_width  = utmp.shape().x();
+    int utmp_height = utmp.shape().y();
+
+    int         uvec_coord_x = 0;
+    int         uvec_coord_y = 0;
+    Coordinates uvec_coord{ uvec_coord_x, uvec_coord_y };
+    for(int y = 0; y < utmp_height; y++)
+    {
+        for(int x = 0; x < utmp_width; x++)
+        {
+            Coordinates coord{ x, y };
+            auto       *utmp_pixel = reinterpret_cast<T *>(utmp(coord));
+            auto       *vtmp_pixel = reinterpret_cast<T *>(vtmp(coord));
+
+            T   border_value(0);
+            int uvec_top_val    = validation::tensor_elem_at(uvec_top, uvec_coord, BorderMode::CONSTANT, border_value);
+            int uvec_bottom_val = validation::tensor_elem_at(uvec_bottom, uvec_coord, BorderMode::CONSTANT, border_value);
+            int vvec_top_val    = validation::tensor_elem_at(vvec_top, uvec_coord, BorderMode::CONSTANT, border_value);
+            int vvec_bottom_val = validation::tensor_elem_at(vvec_bottom, uvec_coord, BorderMode::CONSTANT, border_value);
+
+            utmp_pixel[0] = std::ceil(float(uvec_top_val + uvec_bottom_val) / 2);
+            vtmp_pixel[0] = std::ceil(float(vvec_top_val + vvec_bottom_val) / 2);
+
+            uvec_coord.set(0, ++uvec_coord_x);
+        }
+    }
+
+    int second_plane_x = dst[1].shape().x();
+    int second_plane_y = dst[1].shape().y();
+
+    int utmp_coord_x = 0;
+    int utmp_coord_y = 0;
+
+    for(int y = 0; y < second_plane_y; y++)
+    {
+        for(int x = 0; x < second_plane_x; x++)
+        {
+            Coordinates coord{ x, y };
+            Coordinates utmp_top_coord{ utmp_coord_x, utmp_coord_y };
+            Coordinates utmp_bottom_coord{ utmp_coord_x, utmp_coord_y + 1 };
+
+            auto *dst_pixel = reinterpret_cast<T *>(dst[1](coord));
+
+            T   border_value(0);
+            int utmp_top_val    = validation::tensor_elem_at(utmp, utmp_top_coord, BorderMode::CONSTANT, border_value);
+            int utmp_bottom_val = validation::tensor_elem_at(utmp, utmp_bottom_coord, BorderMode::CONSTANT, border_value);
+
+            int result   = (utmp_top_val + utmp_bottom_val) / 2;
+            dst_pixel[0] = result;
+
+            int vtmp_top_val    = validation::tensor_elem_at(vtmp, utmp_top_coord, BorderMode::CONSTANT, border_value);
+            int vtmp_bottom_val = validation::tensor_elem_at(vtmp, utmp_bottom_coord, BorderMode::CONSTANT, border_value);
+
+            result       = (vtmp_top_val + vtmp_bottom_val) / 2;
+            dst_pixel[1] = result;
+
+            utmp_coord_x++;
+
+            if(utmp_coord_x >= utmp_width)
+            {
+                utmp_coord_x = 0;
+                utmp_coord_y += 2;
+            }
+        }
+    }
+}
+
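+// RGB to IYUV: same chroma sub-sampling as the NV12 path, but the averaged U and V values are
+// written to two separate half-resolution planes.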
+template <typename T>
+inline void colorconvert_rgb_to_iyuv(const SimpleTensor<T> src, std::vector<SimpleTensor<T>> &dst)
+{
+    SimpleTensor<T> rvec(TensorShape{ dst[0].shape().x(), dst[0].shape().y() }, Format::U8);
+    SimpleTensor<T> gvec(TensorShape{ dst[0].shape().x(), dst[0].shape().y() }, Format::U8);
+    SimpleTensor<T> bvec(TensorShape{ dst[0].shape().x(), dst[0].shape().y() }, Format::U8);
+    SimpleTensor<T> yvec(TensorShape{ dst[0].shape().x(), dst[0].shape().y() }, Format::U8);
+
+    int vec_shape_x = src.shape().x() * src.shape().y();
+
+    SimpleTensor<T> uvec_top(TensorShape{ vec_shape_x, 1U }, Format::U8);
+    SimpleTensor<T> uvec_bottom(TensorShape{ vec_shape_x, 1U }, Format::U8);
+    SimpleTensor<T> vvec_top(TensorShape{ vec_shape_x, 1U }, Format::U8);
+    SimpleTensor<T> vvec_bottom(TensorShape{ vec_shape_x, 1U }, Format::U8);
+
+    store_rgb_from_src(src, rvec, gvec, bvec);
+    rgb_to_yuv_calculation(rvec, gvec, bvec, dst[0], uvec_top, uvec_bottom, vvec_top, vvec_bottom);
+
+    SimpleTensor<T> utmp(TensorShape{ src.shape().x() / 2, src.shape().y() }, Format::U8);
+    SimpleTensor<T> vtmp(TensorShape{ src.shape().x() / 2, src.shape().y() }, Format::U8);
+    int             utmp_width  = utmp.shape().x();
+    int             utmp_height = utmp.shape().y();
+
+    int         uvec_coord_x = 0;
+    int         uvec_coord_y = 0;
+    Coordinates uvec_coord{ uvec_coord_x, uvec_coord_y };
+    for(int y = 0; y < utmp_height; y++)
+    {
+        for(int x = 0; x < utmp_width; x++)
+        {
+            Coordinates coord{ x, y };
+            auto       *utmp_pixel = reinterpret_cast<T *>(utmp(coord));
+            auto       *vtmp_pixel = reinterpret_cast<T *>(vtmp(coord));
+
+            T   border_value(0);
+            int uvec_top_val    = validation::tensor_elem_at(uvec_top, uvec_coord, BorderMode::CONSTANT, border_value);
+            int uvec_bottom_val = validation::tensor_elem_at(uvec_bottom, uvec_coord, BorderMode::CONSTANT, border_value);
+            int vvec_top_val    = validation::tensor_elem_at(vvec_top, uvec_coord, BorderMode::CONSTANT, border_value);
+            int vvec_bottom_val = validation::tensor_elem_at(vvec_bottom, uvec_coord, BorderMode::CONSTANT, border_value);
+
+            utmp_pixel[0] = std::ceil(float(uvec_top_val + uvec_bottom_val) / 2);
+            vtmp_pixel[0] = std::ceil(float(vvec_top_val + vvec_bottom_val) / 2);
+
+            uvec_coord.set(0, ++uvec_coord_x);
+        }
+    }
+
+    int second_plane_x = dst[1].shape().x();
+    int second_plane_y = dst[1].shape().y();
+
+    int utmp_coord_x = 0;
+    int utmp_coord_y = 0;
+
+    for(int y = 0; y < second_plane_y; y++)
+    {
+        for(int x = 0; x < second_plane_x; x++)
+        {
+            Coordinates coord{ x, y };
+            Coordinates utmp_top_coord{ utmp_coord_x, utmp_coord_y };
+            Coordinates utmp_bottom_coord{ utmp_coord_x, utmp_coord_y + 1 };
+
+            auto *u_pixel = reinterpret_cast<T *>(dst[1](coord));
+            auto *v_pixel = reinterpret_cast<T *>(dst[2](coord));
+
+            T   border_value(0);
+            int utmp_top_val    = validation::tensor_elem_at(utmp, utmp_top_coord, BorderMode::CONSTANT, border_value);
+            int utmp_bottom_val = validation::tensor_elem_at(utmp, utmp_bottom_coord, BorderMode::CONSTANT, border_value);
+
+            int result = (utmp_top_val + utmp_bottom_val) / 2;
+            u_pixel[0] = result;
+
+            int vtmp_top_val    = validation::tensor_elem_at(vtmp, utmp_top_coord, BorderMode::CONSTANT, border_value);
+            int vtmp_bottom_val = validation::tensor_elem_at(vtmp, utmp_bottom_coord, BorderMode::CONSTANT, border_value);
+
+            result     = (vtmp_top_val + vtmp_bottom_val) / 2;
+            v_pixel[0] = result;
+
+            utmp_coord_x++;
+
+            if(utmp_coord_x >= utmp_width)
+            {
+                utmp_coord_x = 0;
+                utmp_coord_y += 2;
+            }
+        }
+    }
+}
+
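+// RGB to YUV4: the U and V values computed for each horizontal pixel pair are written back at
+// full resolution alongside the Y plane.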
+template <typename T>
+inline void colorconvert_rgb_to_yuv4(const SimpleTensor<T> src, std::vector<SimpleTensor<T>> &dst)
+{
+    SimpleTensor<T> rvec(TensorShape{ dst[0].shape().x(), dst[0].shape().y() }, Format::U8);
+    SimpleTensor<T> gvec(TensorShape{ dst[0].shape().x(), dst[0].shape().y() }, Format::U8);
+    SimpleTensor<T> bvec(TensorShape{ dst[0].shape().x(), dst[0].shape().y() }, Format::U8);
+    SimpleTensor<T> yvec(TensorShape{ dst[0].shape().x(), dst[0].shape().y() }, Format::U8);
+
+    int vec_shape_x = src.shape().x() * src.shape().y();
+
+    SimpleTensor<T> uvec_top(TensorShape{ vec_shape_x, 1U }, Format::U8);
+    SimpleTensor<T> uvec_bottom(TensorShape{ vec_shape_x, 1U }, Format::U8);
+    SimpleTensor<T> vvec_top(TensorShape{ vec_shape_x, 1U }, Format::U8);
+    SimpleTensor<T> vvec_bottom(TensorShape{ vec_shape_x, 1U }, Format::U8);
+
+    int width  = src.shape().x();
+    int height = src.shape().y();
+
+    store_rgb_from_src(src, rvec, gvec, bvec);
+
+    rgb_to_yuv_calculation(rvec, gvec, bvec, dst[0], uvec_top, uvec_bottom, vvec_top, vvec_bottom);
+
+    int         uvec_coord_x = 0;
+    int         uvec_coord_y = 0;
+    Coordinates uvec_coord{ uvec_coord_x, uvec_coord_y };
+    for(int y = 0; y < height; y++)
+    {
+        for(int x = 0; x < width; x += 2)
+        {
+            Coordinates coord{ x, y };
+            auto       *plane_1_pixel = reinterpret_cast<T *>(dst[1](coord));
+            auto       *plane_2_pixel = reinterpret_cast<T *>(dst[2](coord));
+
+            T   border_value(0);
+            int uvec_top_val    = validation::tensor_elem_at(uvec_top, uvec_coord, BorderMode::CONSTANT, border_value);
+            int uvec_bottom_val = validation::tensor_elem_at(uvec_bottom, uvec_coord, BorderMode::CONSTANT, border_value);
+
+            plane_1_pixel[0] = uvec_top_val;
+            plane_1_pixel[1] = uvec_bottom_val;
+
+            int vvec_top_val    = validation::tensor_elem_at(vvec_top, uvec_coord, BorderMode::CONSTANT, border_value);
+            int vvec_bottom_val = validation::tensor_elem_at(vvec_bottom, uvec_coord, BorderMode::CONSTANT, border_value);
+
+            plane_2_pixel[0] = vvec_top_val;
+            plane_2_pixel[1] = vvec_bottom_val;
+
+            uvec_coord.set(0, ++uvec_coord_x);
+        }
+    }
+}
+
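+// YUYV/UYVY to NV12: extracts the Y samples directly and averages the chroma of each pair of
+// vertically adjacent rows into the interleaved half-resolution UV plane.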
+template <typename T>
+inline void colorconvert_yuyv_to_nv12(const SimpleTensor<T> src, const Format format, std::vector<SimpleTensor<T>> &dst)
+{
+    SimpleTensor<T> uvvec_top(TensorShape{ dst[0].shape().x(), dst[0].shape().y() / 2 }, Format::U8);
+    SimpleTensor<T> uvvec_bottom(TensorShape{ dst[0].shape().x(), dst[0].shape().y() / 2 }, Format::U8);
+
+    const int offset = (Format::YUYV422 == format) ? 0 : 1;
+
+    int width  = dst[0].shape().x();
+    int height = dst[0].shape().y();
+
+    for(int y = 0; y < height; ++y)
+    {
+        for(int x = 0; x < width; x++)
+        {
+            const Coordinates dst_coord{ x, y };
+            const Coordinates uv_coord{ x, y / 2 };
+
+            const auto *src_pixel          = reinterpret_cast<const T *>(src(dst_coord));
+            auto       *y_pixel            = reinterpret_cast<T *>(dst[0](dst_coord));
+            auto       *uvvec_top_pixel    = reinterpret_cast<T *>(uvvec_top(uv_coord));
+            auto       *uvvec_bottom_pixel = reinterpret_cast<T *>(uvvec_bottom(uv_coord));
+
+            y_pixel[0] = src_pixel[0 + offset];
+
+            if(y % 2 == 0)
+            {
+                uvvec_top_pixel[0] = src_pixel[1 - offset];
+            }
+            else
+            {
+                uvvec_bottom_pixel[0] = src_pixel[1 - offset];
+            }
+        }
+    }
+
+    width  = dst[1].shape().x();
+    height = dst[1].shape().y();
+
+    int uv_coord_x = 0;
+    int uv_coord_y = 0;
+
+    for(int y = 0; y < height; ++y)
+    {
+        for(int x = 0; x < width; x++)
+        {
+            const Coordinates dst_coord{ x, y };
+            const Coordinates uv_coord{ uv_coord_x, uv_coord_y };
+
+            auto       *uv_pixel           = reinterpret_cast<T *>(dst[1](dst_coord));
+            const auto *uvvec_top_pixel    = reinterpret_cast<T *>(uvvec_top(uv_coord));
+            const auto *uvvec_bottom_pixel = reinterpret_cast<T *>(uvvec_bottom(uv_coord));
+
+            uv_pixel[0] = (uvvec_top_pixel[0] + uvvec_bottom_pixel[0]) / 2;
+            uv_pixel[1] = (uvvec_top_pixel[1] + uvvec_bottom_pixel[1]) / 2;
+            uv_coord_x += 2;
+        }
+        uv_coord_x = 0;
+        uv_coord_y++;
+    }
+}
+
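+// YUYV/UYVY to IYUV: as the NV12 path above, but the averaged U and V values are stored in two
+// separate planes.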
+template <typename T>
+inline void colorconvert_yuyv_to_iyuv(const SimpleTensor<T> src, const Format format, std::vector<SimpleTensor<T>> &dst)
+{
+    SimpleTensor<T> uvvec_top(TensorShape{ dst[0].shape().x(), dst[0].shape().y() / 2 }, Format::U8);
+    SimpleTensor<T> uvvec_bottom(TensorShape{ dst[0].shape().x(), dst[0].shape().y() / 2 }, Format::U8);
+
+    const int offset = (Format::YUYV422 == format) ? 0 : 1;
+
+    int width  = dst[0].shape().x();
+    int height = dst[0].shape().y();
+
+    for(int y = 0; y < height; ++y)
+    {
+        for(int x = 0; x < width; x++)
+        {
+            const Coordinates dst_coord{ x, y };
+            const Coordinates uv_coord{ x, y / 2 };
+
+            const auto *src_pixel          = reinterpret_cast<const T *>(src(dst_coord));
+            auto       *y_pixel            = reinterpret_cast<T *>(dst[0](dst_coord));
+            auto       *uvvec_top_pixel    = reinterpret_cast<T *>(uvvec_top(uv_coord));
+            auto       *uvvec_bottom_pixel = reinterpret_cast<T *>(uvvec_bottom(uv_coord));
+
+            y_pixel[0] = src_pixel[0 + offset];
+
+            if(y % 2 == 0)
+            {
+                uvvec_top_pixel[0] = src_pixel[1 - offset];
+            }
+            else
+            {
+                uvvec_bottom_pixel[0] = src_pixel[1 - offset];
+            }
+        }
+    }
+
+    width  = dst[1].shape().x();
+    height = dst[1].shape().y();
+
+    int uv_coord_x = 0;
+    int uv_coord_y = 0;
+
+    for(int y = 0; y < height; ++y)
+    {
+        for(int x = 0; x < width; x++)
+        {
+            const Coordinates dst_coord{ x, y };
+            const Coordinates uv_coord{ uv_coord_x, uv_coord_y };
+
+            auto       *u_pixel            = reinterpret_cast<T *>(dst[1](dst_coord));
+            auto       *v_pixel            = reinterpret_cast<T *>(dst[2](dst_coord));
+            const auto *uvvec_top_pixel    = reinterpret_cast<T *>(uvvec_top(uv_coord));
+            const auto *uvvec_bottom_pixel = reinterpret_cast<T *>(uvvec_bottom(uv_coord));
+
+            u_pixel[0] = (uvvec_top_pixel[0] + uvvec_bottom_pixel[0]) / 2;
+            v_pixel[0] = (uvvec_top_pixel[1] + uvvec_bottom_pixel[1]) / 2;
+            uv_coord_x += 2;
+        }
+        uv_coord_x = 0;
+        uv_coord_y++;
+    }
+}
+
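+// De-interleaves an NV12/NV21 UV plane into separate U and V planes of the same size.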
+template <typename T>
+inline void nv_to_iyuv(const SimpleTensor<T> src, const Format src_format, SimpleTensor<T> &nv1, SimpleTensor<T> &nv2)
+{
+    int width  = src.shape().x();
+    int height = src.shape().y();
+
+    const int offset = (Format::NV12 == src_format) ? 1 : 0;
+
+    for(int y = 0; y < height; ++y)
+    {
+        for(int x = 0; x < width; x++)
+        {
+            const Coordinates src_coord{ x, y };
+            const auto       *src_pixel = reinterpret_cast<const T *>(src(src_coord));
+            auto             *u_pixel   = reinterpret_cast<T *>(nv1(src_coord));
+            auto             *v_pixel   = reinterpret_cast<T *>(nv2(src_coord));
+
+            u_pixel[0] = src_pixel[1 - offset];
+            v_pixel[0] = src_pixel[0 + offset];
+        }
+    }
+}
+
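+// De-interleaves an NV12/NV21 UV plane and replicates each chroma sample into a 2x2 block to
+// produce full-resolution U and V planes.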
+template <typename T>
+inline void nv_to_yuv4(const SimpleTensor<T> src, const Format src_format, SimpleTensor<T> &nv1, SimpleTensor<T> &nv2)
+{
+    int width  = src.shape().x();
+    int height = src.shape().y();
+
+    const int offset = (Format::NV12 == src_format) ? 1 : 0;
+
+    for(int y = 0; y < height; ++y)
+    {
+        for(int x = 0; x < width; x++)
+        {
+            const Coordinates src_coord{ x, y };
+            Coordinates       dst_coord{ x * 2, y * 2 };
+            const auto       *src_pixel = reinterpret_cast<const T *>(src(src_coord));
+            auto             *u_pixel   = reinterpret_cast<T *>(nv1(dst_coord));
+            auto             *v_pixel   = reinterpret_cast<T *>(nv2(dst_coord));
+
+            u_pixel[0] = src_pixel[1 - offset];
+            u_pixel[1] = src_pixel[1 - offset];
+
+            v_pixel[0] = src_pixel[0 + offset];
+            v_pixel[1] = src_pixel[0 + offset];
+
+            dst_coord.set(1, y * 2 + 1);
+            u_pixel    = reinterpret_cast<T *>(nv1(dst_coord));
+            v_pixel    = reinterpret_cast<T *>(nv2(dst_coord));
+            u_pixel[0] = src_pixel[1 - offset];
+            u_pixel[1] = src_pixel[1 - offset];
+
+            v_pixel[0] = src_pixel[0 + offset];
+            v_pixel[1] = src_pixel[0 + offset];
+        }
+    }
+}
+
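+// NV12/NV21 to IYUV: copies the luma plane and splits the interleaved UV plane into separate
+// U and V planes.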
+template <typename T>
+inline void colorconvert_nv_to_iyuv(const std::vector<SimpleTensor<T>> src, const Format src_format, std::vector<SimpleTensor<T>> &dst)
+{
+    int width  = dst[0].shape().x();
+    int height = dst[0].shape().y();
+
+    for(int y = 0; y < height; ++y)
+    {
+        for(int x = 0; x < width; ++x)
+        {
+            const Coordinates dst_coord{ x, y };
+
+            const auto *src_pixel = reinterpret_cast<const T *>(src[0](dst_coord));
+            auto       *y_pixel   = reinterpret_cast<T *>(dst[0](dst_coord));
+
+            y_pixel[0] = src_pixel[0];
+        }
+    }
+
+    nv_to_iyuv(src[1], src_format, dst[1], dst[2]);
+}
+
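+// NV12/NV21 to YUV4: copies the luma plane and up-samples the interleaved UV plane into
+// full-resolution U and V planes.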
+template <typename T>
+inline void colorconvert_nv_to_yuv4(const std::vector<SimpleTensor<T>> src, const Format src_format, std::vector<SimpleTensor<T>> &dst)
+{
+    int width  = dst[0].shape().x();
+    int height = dst[0].shape().y();
+
+    for(int y = 0; y < height; ++y)
+    {
+        for(int x = 0; x < width; ++x)
+        {
+            const Coordinates dst_coord{ x, y };
+
+            const auto *src_pixel = reinterpret_cast<const T *>(src[0](dst_coord));
+            auto       *y_pixel   = reinterpret_cast<T *>(dst[0](dst_coord));
+
+            y_pixel[0] = src_pixel[0];
+        }
+    }
+
+    nv_to_yuv4(src[1], src_format, dst[1], dst[2]);
+}
+
+} // namespace detail
+} // namespace color_convert_helper
+} // namespace test
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_TEST_VALIDATION_COLOR_CONVERT_H__ */
diff --git a/tests/validation/reference/ConvertFullyConnectedWeights.cpp b/tests/validation/reference/ConvertFullyConnectedWeights.cpp
index b0f537f..e27846c 100644
--- a/tests/validation/reference/ConvertFullyConnectedWeights.cpp
+++ b/tests/validation/reference/ConvertFullyConnectedWeights.cpp
@@ -36,9 +36,15 @@
 {
     SimpleTensor<T> dst(src.shape(), src.data_type());
 
+    const DataLayout original_input_data_layout = (training_data_layout == DataLayout::NCHW) ? DataLayout::NHWC : DataLayout::NCHW;
+
+    const int width_idx   = get_data_layout_dimension_index(original_input_data_layout, DataLayoutDimension::WIDTH);
+    const int height_idx  = get_data_layout_dimension_index(original_input_data_layout, DataLayoutDimension::HEIGHT);
+    const int channel_idx = get_data_layout_dimension_index(original_input_data_layout, DataLayoutDimension::CHANNEL);
+
     const bool         is_nchw_to_nhwc           = training_data_layout == DataLayout::NCHW;
-    const unsigned int num_elems_per_input_plane = original_input_shape.x() * original_input_shape.y();
-    const unsigned int num_channels              = original_input_shape.z();
+    const unsigned int num_elems_per_input_plane = original_input_shape[width_idx] * original_input_shape[height_idx];
+    const unsigned int num_channels              = original_input_shape[channel_idx];
     const unsigned int factor_1                  = is_nchw_to_nhwc ? num_elems_per_input_plane : num_channels;
     const unsigned int factor_2                  = is_nchw_to_nhwc ? num_channels : num_elems_per_input_plane;
 
diff --git a/tests/validation/reference/Convolution3d.h b/tests/validation/reference/Convolution3d.h
index 7001758..2e5fefd 100644
--- a/tests/validation/reference/Convolution3d.h
+++ b/tests/validation/reference/Convolution3d.h
@@ -25,7 +25,6 @@
 #define __ARM_COMPUTE_TEST_VALIDATION_CONVOLUTION_H__
 
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "tests/validation/FixedPoint.h"
 #include "tests/validation/Helpers.h"
 #include "tests/validation/reference/UtilsQuantizedAsymm.h"
 
@@ -91,74 +90,16 @@
     *out_ptr = acc + (*b_ptr);
 }
 
-// 3D convolution for fixed point type
-template < typename T, typename TB, typename std::enable_if < std::is_integral<T>::value &&std::is_integral<TB>::value, int >::type = 0 >
+// 3D convolution for QASYMM8 type
+template < typename T, typename TB, typename std::enable_if < std::is_same<T, uint8_t>::value &&std::is_same<TB, int32_t>::value, int >::type = 0 >
 inline void convolution3d(const SimpleTensor<T> &in, const SimpleTensor<T> &weights, const SimpleTensor<TB> &bias, SimpleTensor<T> &out,
                           int i_offset, int w_offset, int b_offset, int o_offset,
                           int xi, int yi, int width_in, int height_in, int depth_in, int width_weights, int height_weights, int dilation_x = 1, int dilation_y = 1)
 {
-    const T *in_ptr               = in.data() + i_offset;
-    const T *w_ptr                = weights.data() + w_offset;
-    const T *b_ptr                = bias.data() + b_offset;
-    T       *out_ptr              = out.data() + o_offset;
-    int      fixed_point_position = in.fixed_point_position();
-
-    const int half_width_weights_start  = width_weights / 2;
-    const int half_width_weights_end    = ((width_weights % 2) == 0) ? (half_width_weights_start - 1) : half_width_weights_start;
-    const int half_height_weights_start = height_weights / 2;
-    const int half_height_weights_end   = ((height_weights % 2) == 0) ? (half_height_weights_start - 1) : half_height_weights_start;
-
-    using namespace fixed_point_arithmetic;
-    using promoted_type = fixed_point_arithmetic::traits::promote_t<T>;
-
-    // Reset accumulator
-    fixed_point<promoted_type> acc(0, fixed_point_position);
-
-    // Compute a 2D convolution for each IFM and accumulate the result
-    for(int ifm = 0; ifm < depth_in; ++ifm)
-    {
-        // Compute the offset for the input slice
-        const int offset_slice_in = xi + yi * width_in + ifm * width_in * height_in;
-
-        // Compute 2D convolution
-        for(int yk = -half_height_weights_start; yk <= half_height_weights_end; ++yk)
-        {
-            for(int xk = -half_width_weights_start; xk <= half_width_weights_end; ++xk)
-            {
-                // Check if the pixel is out-of-bound
-                if(is_valid_pixel(xi + xk * dilation_x, 0, width_in) && is_valid_pixel(yi + yk * dilation_y, 0, height_in))
-                {
-                    const int idx = xk + half_width_weights_start;
-                    const int idy = yk + half_height_weights_start;
-
-                    const fixed_point<promoted_type> i_value(in_ptr[offset_slice_in + xk * dilation_x + yk * dilation_y * width_in], fixed_point_position, true);
-                    const fixed_point<promoted_type> w_value(w_ptr[idx + idy * width_weights + ifm * width_weights * height_weights], fixed_point_position, true);
-                    const fixed_point<promoted_type> iw = i_value * w_value;
-                    acc                                 = iw + acc;
-                }
-            }
-        }
-    }
-
-    // Get the bias
-    const fixed_point<promoted_type> b(*b_ptr, fixed_point_position, true);
-
-    // Accumulate the bias and covert back
-    acc = acc + b;
-    fixed_point<T> res(acc);
-    *out_ptr = res.raw();
-}
-
-// 3D convolution for QASYMM8 type
-template <>
-inline void convolution3d(const SimpleTensor<uint8_t> &in, const SimpleTensor<uint8_t> &weights, const SimpleTensor<int32_t> &bias, SimpleTensor<uint8_t> &out,
-                          int i_offset, int w_offset, int b_offset, int o_offset,
-                          int xi, int yi, int width_in, int height_in, int depth_in, int width_weights, int height_weights, int dilation_x, int dilation_y)
-{
-    const uint8_t *in_ptr  = in.data() + i_offset;
-    const uint8_t *w_ptr   = weights.data() + w_offset;
-    const int32_t *b_ptr   = bias.data() + b_offset;
-    uint8_t       *out_ptr = out.data() + o_offset;
+    const T  *in_ptr  = in.data() + i_offset;
+    const T  *w_ptr   = weights.data() + w_offset;
+    const TB *b_ptr   = bias.data() + b_offset;
+    T        *out_ptr = out.data() + o_offset;
 
     const int   input_offset   = -in.quantization_info().offset;
     const float input_scale    = in.quantization_info().scale;
diff --git a/tests/validation/reference/ConvolutionLayer.cpp b/tests/validation/reference/ConvolutionLayer.cpp
index fe558ba..7dbdba9 100644
--- a/tests/validation/reference/ConvolutionLayer.cpp
+++ b/tests/validation/reference/ConvolutionLayer.cpp
@@ -23,7 +23,6 @@
  */
 #include "ConvolutionLayer.h"
 
-#include "tests/validation/FixedPoint.h"
 #include "tests/validation/Helpers.h"
 #include "tests/validation/reference/Convolution3d.h"
 #include "tests/validation/reference/Permute.h"
@@ -48,8 +47,10 @@
 
 template <typename T, typename TB>
 SimpleTensor<T> convolution_layer_nchw(const SimpleTensor<T> &src, const SimpleTensor<T> &weights, const SimpleTensor<TB> &bias, SimpleTensor<T> &dst, const PadStrideInfo &info,
-                                       const Size2D &dilation)
+                                       const Size2D &dilation, unsigned int num_groups)
 {
+    ARM_COMPUTE_ERROR_ON((src.shape()[2] / num_groups) != weights.shape()[2]);
+
     // Compute reference
     const int width_in       = src.shape().x();
     const int height_in      = src.shape().y();
@@ -79,23 +80,28 @@
         {
             for(int xi = start_xi; xi < start_xi + end_xi; xi += stride_xi)
             {
-                for(int ofm = 0; ofm < depth_out; ++ofm)
+                for(int group = 0; group < static_cast<int>(num_groups); ++group)
                 {
-                    // Compute input and output offsets
-                    const int offset_in  = r * width_in * height_in * depth_in;
-                    const int xo         = (xi - start_xi) / stride_xi;
-                    const int yo         = (yi - start_yi) / stride_yi;
-                    const int offset_out = xo + yo * width_out + ofm * width_out * height_out + r * width_out * height_out * depth_out;
+                    for(int ofm = 0; ofm < static_cast<int>(depth_out / num_groups); ++ofm)
+                    {
+                        // Compute input and output offsets
+                        const int offset_in  = r * width_in * height_in * depth_in + (group * (depth_in / num_groups) * width_in * height_in);
+                        const int xo         = (xi - start_xi) / stride_xi;
+                        const int yo         = (yi - start_yi) / stride_yi;
+                        const int offset_out = xo + yo * width_out + ((ofm + group * (depth_out / num_groups)) * width_out * height_out) + (r * width_out * height_out * depth_out);
+                        const int offset_w   = (ofm + group * (depth_out / num_groups)) * width_weights * height_weights * depth_weights;
+                        const int offset_b   = (ofm + group * (depth_out / num_groups));
 
-                    ARM_COMPUTE_ASSERT(xo < width_out);
-                    ARM_COMPUTE_ASSERT(yo < height_out);
+                        ARM_COMPUTE_ASSERT(xo < width_out);
+                        ARM_COMPUTE_ASSERT(yo < height_out);
 
-                    // Compute 3D convolution
-                    convolution_3d::detail::convolution3d(src, weights, bias, dst,
-                                                          offset_in, ofm * width_weights * height_weights * depth_weights, ofm, offset_out,
-                                                          xi, yi,
-                                                          width_in, height_in, depth_in,
-                                                          width_weights, height_weights, dilation.x(), dilation.y());
+                        // Compute 3D convolution
+                        convolution_3d::detail::convolution3d(src, weights, bias, dst,
+                                                              offset_in, offset_w, offset_b, offset_out,
+                                                              xi, yi,
+                                                              width_in, height_in, (depth_in / num_groups),
+                                                              width_weights, height_weights, dilation.x(), dilation.y());
+                    }
                 }
             }
         }
@@ -105,10 +111,10 @@
 }
 template <typename T, typename TB>
 SimpleTensor<T> convolution_layer(const SimpleTensor<T> &src, const SimpleTensor<T> &weights, const SimpleTensor<TB> &bias, const TensorShape &output_shape, const PadStrideInfo &info,
-                                  const Size2D &dilation)
+                                  const Size2D &dilation, unsigned int num_groups)
 {
     // Create reference
-    SimpleTensor<T> dst{ output_shape, src.data_type(), 1, src.fixed_point_position(), src.quantization_info() };
+    SimpleTensor<T> dst{ output_shape, src.data_type(), 1, src.quantization_info() };
 
     if(src.data_layout() == DataLayout::NHWC)
     {
@@ -116,24 +122,20 @@
         SimpleTensor<T> weights_nchw = reference::permute<T>(weights, PermutationVector(1U, 2U, 0U));
         SimpleTensor<T> dst_nchw     = reference::permute<T>(dst, PermutationVector(1U, 2U, 0U));
 
-        return reference::permute<T>(convolution_layer_nchw(src_nchw, weights_nchw, bias, dst_nchw, info, dilation), PermutationVector(2U, 0U, 1U));
+        return reference::permute<T>(convolution_layer_nchw(src_nchw, weights_nchw, bias, dst_nchw, info, dilation, num_groups), PermutationVector(2U, 0U, 1U));
     }
     else
     {
-        return convolution_layer_nchw(src, weights, bias, dst, info, dilation);
+        return convolution_layer_nchw(src, weights, bias, dst, info, dilation, num_groups);
     }
 }
 
 template SimpleTensor<float> convolution_layer(const SimpleTensor<float> &src, const SimpleTensor<float> &weights, const SimpleTensor<float> &bias, const TensorShape &output_shape,
-                                               const PadStrideInfo &info, const Size2D &dilation);
+                                               const PadStrideInfo &info, const Size2D &dilation, unsigned int num_groups);
 template SimpleTensor<half> convolution_layer(const SimpleTensor<half> &src, const SimpleTensor<half> &weights, const SimpleTensor<half> &bias, const TensorShape &output_shape,
-                                              const PadStrideInfo &info, const Size2D &dilation);
-template SimpleTensor<qint8_t> convolution_layer(const SimpleTensor<qint8_t> &src, const SimpleTensor<qint8_t> &weights, const SimpleTensor<qint8_t> &bias, const TensorShape &output_shape,
-                                                 const PadStrideInfo &info, const Size2D &dilation);
-template SimpleTensor<qint16_t> convolution_layer(const SimpleTensor<qint16_t> &src, const SimpleTensor<qint16_t> &weights, const SimpleTensor<qint16_t> &bias, const TensorShape &output_shape,
-                                                  const PadStrideInfo &info, const Size2D &dilation);
+                                              const PadStrideInfo &info, const Size2D &dilation, unsigned int num_groups);
 template SimpleTensor<uint8_t> convolution_layer(const SimpleTensor<uint8_t> &src, const SimpleTensor<uint8_t> &weights, const SimpleTensor<int32_t> &bias, const TensorShape &output_shape,
-                                                 const PadStrideInfo &info, const Size2D &dilation);
+                                                 const PadStrideInfo &info, const Size2D &dilation, unsigned int num_groups);
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/ConvolutionLayer.h b/tests/validation/reference/ConvolutionLayer.h
index ff3b153..ccce53a 100644
--- a/tests/validation/reference/ConvolutionLayer.h
+++ b/tests/validation/reference/ConvolutionLayer.h
@@ -37,7 +37,7 @@
 {
 template <typename T, typename TB>
 SimpleTensor<T> convolution_layer(const SimpleTensor<T> &src, const SimpleTensor<T> &weights, const SimpleTensor<TB> &bias, const TensorShape &output_shape, const PadStrideInfo &info,
-                                  const Size2D &dilation = Size2D(1U, 1U));
+                                  const Size2D &dilation = Size2D(1U, 1U), unsigned int num_groups = 1);
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/DeconvolutionLayer.cpp b/tests/validation/reference/DeconvolutionLayer.cpp
index 617f690..e73023e 100644
--- a/tests/validation/reference/DeconvolutionLayer.cpp
+++ b/tests/validation/reference/DeconvolutionLayer.cpp
@@ -23,7 +23,6 @@
  */
 #include "ConvolutionLayer.h"
 
-#include "tests/validation/FixedPoint.h"
 #include "tests/validation/Helpers.h"
 
 namespace arm_compute
@@ -46,7 +45,7 @@
     int         out_y        = src.shape().y() + (src.shape().y() - 1) * (stride_y - 1) + a.second + 2 * info.pad().second;
     scaled_shape.set(0, out_x);
     scaled_shape.set(1, out_y);
-    SimpleTensor<T> scaled{ scaled_shape, src.data_type(), 1, src.fixed_point_position() };
+    SimpleTensor<T> scaled{ scaled_shape, src.data_type(), 1 };
 
     const int width_in      = src.shape().x();
     const int height_in     = src.shape().y();
@@ -91,6 +90,8 @@
 
 template SimpleTensor<float> deconvolution_layer(const SimpleTensor<float> &src, const SimpleTensor<float> &weights, const SimpleTensor<float> &bias, const TensorShape &output_shape,
                                                  const PadStrideInfo &info, const std::pair<unsigned int, unsigned int> &a);
+template SimpleTensor<half> deconvolution_layer(const SimpleTensor<half> &src, const SimpleTensor<half> &weights, const SimpleTensor<half> &bias, const TensorShape &output_shape,
+                                                const PadStrideInfo &info, const std::pair<unsigned int, unsigned int> &a);
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/DepthConcatenateLayer.cpp b/tests/validation/reference/DepthConcatenateLayer.cpp
index 9a72484..90fbd91 100644
--- a/tests/validation/reference/DepthConcatenateLayer.cpp
+++ b/tests/validation/reference/DepthConcatenateLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,7 +23,6 @@
  */
 #include "DepthConcatenateLayer.h"
 
-#include "tests/validation/FixedPoint.h"
 #include "tests/validation/Helpers.h"
 
 namespace arm_compute
@@ -93,10 +92,9 @@
     return dst;
 }
 
+template SimpleTensor<uint8_t> depthconcatenate_layer(const std::vector<SimpleTensor<uint8_t>> &srcs);
 template SimpleTensor<float> depthconcatenate_layer(const std::vector<SimpleTensor<float>> &srcs);
 template SimpleTensor<half> depthconcatenate_layer(const std::vector<SimpleTensor<half>> &srcs);
-template SimpleTensor<qint8_t> depthconcatenate_layer(const std::vector<SimpleTensor<qint8_t>> &srcs);
-template SimpleTensor<qint16_t> depthconcatenate_layer(const std::vector<SimpleTensor<qint16_t>> &srcs);
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/DepthConvertLayer.cpp b/tests/validation/reference/DepthConvertLayer.cpp
index dd095b8..fd2e0ae 100644
--- a/tests/validation/reference/DepthConvertLayer.cpp
+++ b/tests/validation/reference/DepthConvertLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,7 +23,6 @@
  */
 #include "DepthConvertLayer.h"
 
-#include "tests/validation/FixedPoint.h"
 #include "tests/validation/Helpers.h"
 
 #include "tests/Types.h"
@@ -36,44 +35,6 @@
 {
 namespace reference
 {
-template < typename T1, typename T2, typename std::enable_if < std::is_integral<T1>::value &&std::is_floating_point<T2>::value, int >::type >
-SimpleTensor<T2> depth_convert(const SimpleTensor<T1> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift)
-{
-    ARM_COMPUTE_UNUSED(policy);
-    ARM_COMPUTE_UNUSED(shift);
-
-    using namespace fixed_point_arithmetic;
-    SimpleTensor<T2> result(src.shape(), dt_out);
-
-    const int fixed_point_position = src.fixed_point_position();
-
-    for(int i = 0; i < src.num_elements(); ++i)
-    {
-        result[i] = static_cast<float>(fixed_point<T1>(src[i], fixed_point_position, true));
-    }
-
-    return result;
-}
-
-template < typename T1, typename T2, typename std::enable_if < std::is_floating_point<T1>::value &&std::is_integral<T2>::value, int >::type >
-SimpleTensor<T2> depth_convert(const SimpleTensor<T1> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift)
-{
-    ARM_COMPUTE_UNUSED(policy);
-    ARM_COMPUTE_UNUSED(shift);
-
-    using namespace fixed_point_arithmetic;
-    SimpleTensor<T2> result(src.shape(), dt_out, 1, src.fixed_point_position());
-
-    const int fixed_point_position = result.fixed_point_position();
-
-    for(int i = 0; i < src.num_elements(); ++i)
-    {
-        result[i] = fixed_point<T2>(src[i], fixed_point_position).raw();
-    }
-
-    return result;
-}
-
 template < typename T1, typename T2, typename std::enable_if < std::is_integral<T1>::value &&std::is_integral<T2>::value &&!std::is_same<T1, T2>::value, int >::type >
 SimpleTensor<T2> depth_convert(const SimpleTensor<T1> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift)
 {
@@ -99,45 +60,31 @@
     return result;
 }
 
-template < typename T1, typename T2, typename std::enable_if < std::is_integral<T1>::value &&std::is_integral<T2>::value &&std::is_same<T1, T2>::value, int >::type >
+template < typename T1, typename T2, typename std::enable_if < is_floating_point<T1>::value &&is_floating_point<T2>::value &&!std::is_same<T1, T2>::value, int >::type >
 SimpleTensor<T2> depth_convert(const SimpleTensor<T1> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift)
 {
-    ARM_COMPUTE_UNUSED(policy);
-
-    using namespace fixed_point_arithmetic;
-
     SimpleTensor<T2> result(src.shape(), dt_out);
 
-    bool is_in_place = (&src == &result);
+    const uint32_t scale = 1 << shift;
 
-    const int fixed_point_position_in  = src.fixed_point_position();
-    const int fixed_point_position_out = (is_in_place) ? static_cast<int>(shift) : result.fixed_point_position();
-
-    if(!is_in_place || (fixed_point_position_in != fixed_point_position_out))
+    // Up-casting
+    if(src.data_type() <= dt_out)
     {
         for(int i = 0; i < src.num_elements(); ++i)
         {
-            auto x = fixed_point<T2>(src[i], fixed_point_position_in, true);
-            x.resacle(fixed_point_position_out);
-            result[i] = x.raw();
+            result[i] = src[i] * static_cast<T2>(scale);
         }
     }
-
-    return result;
-}
-
-template < typename T1, typename T2, typename std::enable_if < std::is_floating_point<T1>::value &&is_floating_point<T2>::value, int >::type >
-SimpleTensor<T2> depth_convert(const SimpleTensor<T1> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift)
-{
-    ARM_COMPUTE_UNUSED(policy);
-    ARM_COMPUTE_UNUSED(shift);
-
-    SimpleTensor<T2> result(src.shape(), dt_out);
-
-    for(int i = 0; i < src.num_elements(); ++i)
+    // Down-casting
+    else
     {
-        result[i] = static_cast<T2>(src[i]);
+        for(int i = 0; i < src.num_elements(); ++i)
+        {
+            T1 val    = src[i] / static_cast<T1>(scale);
+            result[i] = (policy == ConvertPolicy::SATURATE) ? saturate_cast<T2>(val) : static_cast<T2>(val);
+        }
     }
+    return result;
 }
 
 template SimpleTensor<uint16_t> depth_convert(const SimpleTensor<uint8_t> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift);
@@ -147,10 +94,8 @@
 template SimpleTensor<uint32_t> depth_convert(const SimpleTensor<uint16_t> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift);
 template SimpleTensor<uint8_t> depth_convert(const SimpleTensor<int16_t> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift);
 template SimpleTensor<int32_t> depth_convert(const SimpleTensor<int16_t> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift);
-template SimpleTensor<float> depth_convert(const SimpleTensor<int8_t> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift);
-template SimpleTensor<float> depth_convert(const SimpleTensor<int16_t> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift);
-template SimpleTensor<int8_t> depth_convert(const SimpleTensor<float> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift);
-template SimpleTensor<int16_t> depth_convert(const SimpleTensor<float> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift);
+template SimpleTensor<half> depth_convert(const SimpleTensor<float> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift);
+template SimpleTensor<float> depth_convert(const SimpleTensor<half> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift);
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/DepthConvertLayer.h b/tests/validation/reference/DepthConvertLayer.h
index 1446bfd..5d97c73 100644
--- a/tests/validation/reference/DepthConvertLayer.h
+++ b/tests/validation/reference/DepthConvertLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -35,19 +35,10 @@
 {
 namespace reference
 {
-template < typename T1, typename T2, typename std::enable_if < std::is_integral<T1>::value &&std::is_floating_point<T2>::value, int >::type = 0 >
-SimpleTensor<T2> depth_convert(const SimpleTensor<T1> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift);
-
-template < typename T1, typename T2, typename std::enable_if < std::is_floating_point<T1>::value &&std::is_integral<T2>::value, int >::type = 0 >
-SimpleTensor<T2> depth_convert(const SimpleTensor<T1> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift);
-
 template < typename T1, typename T2, typename std::enable_if < std::is_integral<T1>::value &&std::is_integral<T2>::value &&!std::is_same<T1, T2>::value, int >::type = 0 >
 SimpleTensor<T2> depth_convert(const SimpleTensor<T1> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift);
 
-template < typename T1, typename T2, typename std::enable_if < std::is_integral<T1>::value &&std::is_integral<T2>::value &&std::is_same<T1, T2>::value, int >::type = 0 >
-SimpleTensor<T2> depth_convert(const SimpleTensor<T1> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift);
-
-template < typename T1, typename T2, typename std::enable_if < std::is_floating_point<T1>::value &&is_floating_point<T2>::value, int >::type = 0 >
+template < typename T1, typename T2, typename std::enable_if < is_floating_point<T1>::value &&is_floating_point<T2>::value &&!std::is_same<T1, T2>::value, int >::type = 0 >
 SimpleTensor<T2> depth_convert(const SimpleTensor<T1> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift);
 } // namespace reference
 } // namespace validation
diff --git a/tests/validation/reference/DepthwiseConvolutionLayer.cpp b/tests/validation/reference/DepthwiseConvolutionLayer.cpp
index 10c617e..39429e2 100644
--- a/tests/validation/reference/DepthwiseConvolutionLayer.cpp
+++ b/tests/validation/reference/DepthwiseConvolutionLayer.cpp
@@ -26,7 +26,6 @@
 #include "ConvolutionLayer.h"
 #include "Utils.h"
 
-#include "tests/validation/FixedPoint.h"
 #include "tests/validation/Helpers.h"
 #include "tests/validation/reference/Utils.h"
 #include "tests/validation/reference/UtilsQuantizedAsymm.h"
@@ -53,7 +52,7 @@
 SimpleTensor<T> depthwise_convolution(const SimpleTensor<T> &src, const SimpleTensor<T> &weights, const SimpleTensor<TB> &biases, const TensorShape &dst_shape, const PadStrideInfo &conv_info,
                                       unsigned int depth_multiplier)
 {
-    SimpleTensor<T> dst{ dst_shape, src.data_type(), 1, src.fixed_point_position() };
+    SimpleTensor<T> dst{ dst_shape, src.data_type(), 1 };
 
     // Compute reference
     const int filter_width  = weights.shape().x();
@@ -122,7 +121,7 @@
 SimpleTensor<uint8_t> depthwise_convolution(const SimpleTensor<uint8_t> &src, const SimpleTensor<uint8_t> &weights, const SimpleTensor<int32_t> &biases, const TensorShape &dst_shape,
                                             const PadStrideInfo &conv_info, unsigned int depth_multiplier)
 {
-    SimpleTensor<uint8_t> dst{ dst_shape, src.data_type(), 1, src.fixed_point_position(), src.quantization_info() };
+    SimpleTensor<uint8_t> dst{ dst_shape, src.data_type(), 1, src.quantization_info() };
 
     // Create reference
     const int   input_offset   = -src.quantization_info().offset;
diff --git a/tests/validation/reference/FixedPoint.cpp b/tests/validation/reference/FixedPoint.cpp
deleted file mode 100644
index a016093..0000000
--- a/tests/validation/reference/FixedPoint.cpp
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "FixedPoint.h"
-
-#include "arm_compute/core/Types.h"
-#include "tests/validation/FixedPoint.h"
-#include "tests/validation/Helpers.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace reference
-{
-template <typename T>
-SimpleTensor<T> fixed_point_operation(const SimpleTensor<T> &src, FixedPointOp op)
-{
-    SimpleTensor<T> result(src.shape(), src.data_type());
-
-    const int p = src.fixed_point_position();
-    switch(op)
-    {
-        case FixedPointOp::EXP:
-            for(int i = 0; i < src.num_elements(); ++i)
-            {
-                result[i] = fixed_point_arithmetic::exp(fixed_point_arithmetic::fixed_point<T>(src[i], p, true)).raw();
-            }
-            break;
-        case FixedPointOp::LOG:
-            for(int i = 0; i < src.num_elements(); ++i)
-            {
-                result[i] = fixed_point_arithmetic::log(fixed_point_arithmetic::fixed_point<T>(src[i], p, true)).raw();
-            }
-            break;
-        case FixedPointOp::INV_SQRT:
-            for(int i = 0; i < src.num_elements(); ++i)
-            {
-                result[i] = fixed_point_arithmetic::inv_sqrt(fixed_point_arithmetic::fixed_point<T>(src[i], p, true)).raw();
-            }
-            break;
-        case FixedPointOp::RECIPROCAL:
-            for(int i = 0; i < src.num_elements(); ++i)
-            {
-                result[i] = fixed_point_arithmetic::div(fixed_point_arithmetic::fixed_point<T>(1, p), fixed_point_arithmetic::fixed_point<T>(src[i], p, true)).raw();
-            }
-            break;
-        default:
-            ARM_COMPUTE_ERROR("Fixed point operation not supported");
-            break;
-    }
-
-    return result;
-}
-
-template SimpleTensor<int8_t> fixed_point_operation(const SimpleTensor<int8_t> &src, FixedPointOp op);
-template SimpleTensor<int16_t> fixed_point_operation(const SimpleTensor<int16_t> &src, FixedPointOp op);
-} // namespace reference
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/reference/FixedPointPixelWiseMultiplication.cpp b/tests/validation/reference/FixedPointPixelWiseMultiplication.cpp
deleted file mode 100644
index 636919b..0000000
--- a/tests/validation/reference/FixedPointPixelWiseMultiplication.cpp
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "FixedPointPixelWiseMultiplication.h"
-
-#include "tests/validation/FixedPoint.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace reference
-{
-template <typename T>
-SimpleTensor<T> fixed_point_pixel_wise_multiplication(const SimpleTensor<T> &src1, const SimpleTensor<T> &src2, float scale, ConvertPolicy convert_policy)
-{
-    using namespace fixed_point_arithmetic;
-
-    SimpleTensor<T> dst(src2.shape(), src2.data_type(), 1, src2.fixed_point_position());
-
-    const int fixed_point_position = src1.fixed_point_position();
-
-    ARM_COMPUTE_ERROR_ON_MSG(src1.data_type() != src2.data_type() || src1.data_type() != dst.data_type(),
-                             "Tensors must all have the same DataType");
-    ARM_COMPUTE_ERROR_ON_MSG(fixed_point_position != src2.fixed_point_position() || fixed_point_position != dst.fixed_point_position(),
-                             "Fixed-point position must be the same for both inputs and outputs");
-
-    // Validate fixed_point_position
-    ARM_COMPUTE_ERROR_ON((src1.data_type() == DataType::QS8) && (fixed_point_position == 0 || fixed_point_position > 7));
-    ARM_COMPUTE_ERROR_ON((src1.data_type() == DataType::QS16) && (fixed_point_position == 0 || fixed_point_position > 15));
-
-    const fixed_point<T> fp_scale(scale, fixed_point_position);
-    const bool           is_sat = convert_policy == ConvertPolicy::SATURATE;
-
-    for(int i = 0; i < src1.num_elements(); ++i)
-    {
-        const fixed_point<T> val1(src1[i], fixed_point_position, true);
-        fixed_point<T>       res(src2[i], fixed_point_position, true);
-        if(is_sat)
-        {
-            res = mul(mul(res, val1), fp_scale);
-        }
-        else
-        {
-            res = mul<OverflowPolicy::WRAP>(mul<OverflowPolicy::WRAP>(res, val1), fp_scale);
-        }
-        dst[i] = res.raw();
-    }
-
-    return dst;
-}
-
-// *INDENT-OFF*
-// clang-format off
-template SimpleTensor<qint8_t> fixed_point_pixel_wise_multiplication(const SimpleTensor<qint8_t> &src1, const SimpleTensor<qint8_t> &src2, float scale, ConvertPolicy convert_policy);
-template SimpleTensor<qint16_t> fixed_point_pixel_wise_multiplication(const SimpleTensor<qint16_t> &src1, const SimpleTensor<qint16_t> &src2, float scale, ConvertPolicy convert_policy);
-// *INDENT-ON*
-// clang-format on
-
-} // namespace reference
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/reference/FlattenLayer.cpp b/tests/validation/reference/FlattenLayer.cpp
index 44f4d93..381ce37 100644
--- a/tests/validation/reference/FlattenLayer.cpp
+++ b/tests/validation/reference/FlattenLayer.cpp
@@ -23,8 +23,6 @@
  */
 #include "FlattenLayer.h"
 
-#include "tests/validation/FixedPoint.h"
-
 namespace arm_compute
 {
 namespace test
@@ -36,7 +34,7 @@
 template <typename T>
 SimpleTensor<T> flatten_layer(const SimpleTensor<T> &src, const TensorShape &shape_flatten)
 {
-    SimpleTensor<T> dst(shape_flatten, src.data_type(), 1, src.fixed_point_position());
+    SimpleTensor<T> dst(shape_flatten, src.data_type(), 1);
 
     // Note: Since the reference implementation does not use padding bytes, we can copy directly the content of the source tensor
     std::copy(src.data(), src.data() + src.num_elements(), dst.data());
@@ -46,8 +44,6 @@
 
 template SimpleTensor<float> flatten_layer(const SimpleTensor<float> &src, const TensorShape &shape_flatten);
 template SimpleTensor<half> flatten_layer(const SimpleTensor<half> &src, const TensorShape &shape_flatten);
-template SimpleTensor<qint8_t> flatten_layer(const SimpleTensor<qint8_t> &src, const TensorShape &shape_flatten);
-template SimpleTensor<qint16_t> flatten_layer(const SimpleTensor<qint16_t> &src, const TensorShape &shape_flatten);
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/FullyConnectedLayer.cpp b/tests/validation/reference/FullyConnectedLayer.cpp
index 5384715..d65d0ca 100644
--- a/tests/validation/reference/FullyConnectedLayer.cpp
+++ b/tests/validation/reference/FullyConnectedLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,7 +24,6 @@
 #include "FullyConnectedLayer.h"
 
 #include "arm_compute/core/Types.h"
-#include "tests/validation/FixedPoint.h"
 #include "tests/validation/reference/UtilsQuantizedAsymm.h"
 
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
@@ -44,10 +43,8 @@
 // Vector matrix multiply for floating point
 template < typename T, typename TB, typename std::enable_if < is_floating_point<T>::value &&is_floating_point<TB>::value, int >::type = 0 >
 void vector_matrix_multiply(const SimpleTensor<T> &src, const SimpleTensor<T> &weights, const SimpleTensor<TB> &bias, SimpleTensor<T> &dst, int offset_src, int offset_dst, int cols_weights,
-                            int rows_weights, uint8_t fixed_point_position)
+                            int rows_weights)
 {
-    ARM_COMPUTE_UNUSED(fixed_point_position);
-
     const T *src_ptr     = src.data() + offset_src;
     const T *weights_ptr = weights.data();
     const TB *bias_ptr    = bias.data();
@@ -60,57 +57,16 @@
     }
 }
 
-// Vector matrix multiply for fixed point type
-template < typename T, typename TB, typename std::enable_if < std::is_integral<T>::value &&std::is_integral<TB>::value, int >::type = 0 >
-void vector_matrix_multiply(const SimpleTensor<T> &src, const SimpleTensor<T> &weights, const SimpleTensor<TB> &bias, SimpleTensor<T> &dst, int offset_src, int offset_dst, int cols_weights,
-                            int rows_weights, uint8_t fixed_point_position)
+// Vector matrix multiply for quantized type
+template < typename T, typename TB, typename std::enable_if < std::is_same<T, uint8_t>::value &&std::is_same<TB, int32_t>::value, int >::type = 0 >
+void vector_matrix_multiply(const SimpleTensor<T> &src, const SimpleTensor<T> &weights, const SimpleTensor<TB> &bias, SimpleTensor<T> &dst, int offset_src, int offset_dst,
+                            int cols_weights, int rows_weights)
 {
     const T *src_ptr     = src.data() + offset_src;
     const T *weights_ptr = weights.data();
     const TB *bias_ptr    = bias.data();
     T        *dst_ptr     = dst.data() + offset_dst;
 
-    using namespace fixed_point_arithmetic;
-    using promoted_type = fixed_point_arithmetic::traits::promote_t<T>;
-
-    for(int y = 0; y < rows_weights; ++y)
-    {
-        // Reset accumulator
-        fixed_point<promoted_type> acc(0, fixed_point_position);
-
-        for(int x = 0; x < cols_weights; ++x)
-        {
-            const fixed_point<promoted_type> i_value(src_ptr[x], fixed_point_position, true);
-            const fixed_point<promoted_type> w_value(weights_ptr[x], fixed_point_position, true);
-            acc = acc + i_value * w_value;
-        }
-
-        // Get the bias
-        const fixed_point<T> b(bias_ptr[y], fixed_point_position, true);
-
-        // Convert back and accumulate the bias
-        fixed_point<T> res(acc);
-        res = res + b;
-
-        // Store the result
-        dst_ptr[y] = res.raw();
-
-        weights_ptr += cols_weights;
-    }
-}
-
-// Vector matrix multiply for quantized type
-template <>
-void vector_matrix_multiply(const SimpleTensor<uint8_t> &src, const SimpleTensor<uint8_t> &weights, const SimpleTensor<int32_t> &bias, SimpleTensor<uint8_t> &dst, int offset_src, int offset_dst,
-                            int cols_weights, int rows_weights, uint8_t fixed_point_position)
-{
-    ARM_COMPUTE_UNUSED(fixed_point_position);
-
-    const uint8_t *src_ptr     = src.data() + offset_src;
-    const uint8_t *weights_ptr = weights.data();
-    const int32_t *bias_ptr    = bias.data();
-    uint8_t       *dst_ptr     = dst.data() + offset_dst;
-
     const int   input_offset   = -src.quantization_info().offset;
     const float input_scale    = src.quantization_info().scale;
     const int   weights_offset = -weights.quantization_info().offset;
@@ -141,7 +97,7 @@
         acc = utility::clamp<int32_t>(acc, 0, 255);
 
         // Store the result
-        dst_ptr[y] = static_cast<uint8_t>(acc);
+        dst_ptr[y] = static_cast<T>(acc);
 
         weights_ptr += cols_weights;
     }
@@ -152,7 +108,7 @@
 SimpleTensor<T> fully_connected_layer(const SimpleTensor<T> &src, const SimpleTensor<T> &weights, const SimpleTensor<TB> &bias, const TensorShape &dst_shape)
 {
     // Create reference
-    SimpleTensor<T> dst{ TensorShape{ dst_shape }, src.data_type(), 1, src.fixed_point_position(), src.quantization_info() };
+    SimpleTensor<T> dst{ TensorShape{ dst_shape }, src.data_type(), 1, src.quantization_info() };
 
     // Sanity checks
     const int          num_batch_dimensions = std::max(0, static_cast<int>(dst_shape.num_dimensions()) - 1);
@@ -183,8 +139,7 @@
                                   offset_in,
                                   offset_out,
                                   cols_weights,
-                                  rows_weights,
-                                  src.fixed_point_position());
+                                  rows_weights);
     }
 
     return dst;
@@ -192,8 +147,6 @@
 
 template SimpleTensor<float> fully_connected_layer(const SimpleTensor<float> &src, const SimpleTensor<float> &weights, const SimpleTensor<float> &bias, const TensorShape &dst_shape);
 template SimpleTensor<half> fully_connected_layer(const SimpleTensor<half> &src, const SimpleTensor<half> &weights, const SimpleTensor<half> &bias, const TensorShape &dst_shape);
-template SimpleTensor<qint8_t> fully_connected_layer(const SimpleTensor<qint8_t> &src, const SimpleTensor<qint8_t> &weights, const SimpleTensor<qint8_t> &bias, const TensorShape &dst_shape);
-template SimpleTensor<qint16_t> fully_connected_layer(const SimpleTensor<qint16_t> &src, const SimpleTensor<qint16_t> &weights, const SimpleTensor<qint16_t> &bias, const TensorShape &dst_shape);
 template SimpleTensor<uint8_t> fully_connected_layer(const SimpleTensor<uint8_t> &src, const SimpleTensor<uint8_t> &weights, const SimpleTensor<int32_t> &bias, const TensorShape &dst_shape);
 } // namespace reference
 } // namespace validation
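
The quantized vector_matrix_multiply above accumulates in int32 with the (already negated) input and weight offsets applied, then requantizes and clamps the result to the uint8 range. A minimal, self-contained sketch of that arithmetic for one output element; the parameter names are illustrative, and the plain float multiplier plus std::lround is a simplification of the rounding helpers in UtilsQuantizedAsymm.h.

    #include <algorithm>
    #include <cmath>
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // acc = sum_x (src[x] + input_offset) * (weights[x] + weights_offset) + bias,
    // where, as in the reference, the offsets passed in are the negated
    // quantization offsets. The accumulator is then rescaled to the output
    // quantization and clamped to [0, 255].
    uint8_t quantized_dot(const std::vector<uint8_t> &src, const std::vector<uint8_t> &weights, int32_t bias,
                          int input_offset, float input_scale, int weights_offset, float weights_scale,
                          int output_offset, float output_scale)
    {
        int32_t acc = bias;
        for(std::size_t x = 0; x < src.size(); ++x)
        {
            acc += (static_cast<int32_t>(src[x]) + input_offset) * (static_cast<int32_t>(weights[x]) + weights_offset);
        }
        const float   multiplier  = (input_scale * weights_scale) / output_scale;
        const int32_t requantized = static_cast<int32_t>(std::lround(acc * multiplier)) + output_offset;
        return static_cast<uint8_t>(std::min(255, std::max(0, requantized)));
    }
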
diff --git a/tests/validation/reference/GEMM.cpp b/tests/validation/reference/GEMM.cpp
index f9dcfcb..2feab89 100644
--- a/tests/validation/reference/GEMM.cpp
+++ b/tests/validation/reference/GEMM.cpp
@@ -24,7 +24,6 @@
 #include "GEMM.h"
 
 #include "arm_compute/core/Types.h"
-#include "tests/validation/FixedPoint.h"
 
 namespace arm_compute
 {
@@ -38,7 +37,7 @@
 SimpleTensor<T> gemm(const SimpleTensor<T> &a, const SimpleTensor<T> &b, const SimpleTensor<T> &c, float alpha, float beta)
 {
     // Create reference
-    SimpleTensor<T> dst{ c.shape(), c.data_type(), 1, c.fixed_point_position() };
+    SimpleTensor<T> dst{ c.shape(), c.data_type(), 1 };
 
     // Compute reference
     const int M = a.shape().y();
@@ -85,79 +84,8 @@
     return dst;
 }
 
-template <typename T, typename std::enable_if<std::is_integral<T>::value, int>::type>
-SimpleTensor<T> gemm(const SimpleTensor<T> &a, const SimpleTensor<T> &b, const SimpleTensor<T> &c, float alpha, float beta)
-{
-    using namespace fixed_point_arithmetic;
-
-    // Create reference
-    SimpleTensor<T> dst{ c.shape(), c.data_type(), 1, c.fixed_point_position() };
-
-    // Compute reference
-    using promoted_type = fixed_point_arithmetic::traits::promote_t<T>;
-
-    const int M = dst.shape().y();
-    const int N = dst.shape().x();
-    const int K = a.shape().x();
-    const int D = a.shape().z(); // Number of matrices in a batch
-    const int W = a.shape()[3];  // Number of batched-gemm (Winograd case)
-
-    const int a_stride_z = K * M;
-    const int a_stride_w = K * M * D;
-
-    const int b_stride_z = b.shape().num_dimensions() > 2 ? N * K : 0;     // Do not slide the matrix B along the 3th dimension in case matrix B has less than 3 dimensions
-    const int b_stride_w = b.shape().num_dimensions() > 3 ? K * N * D : 0; // Do not slide the matrix B along the 4th dimension in case matrix B has less than 4 dimensions
-
-    const int c_stride_z = N * M;
-    const int c_stride_w = N * M * D;
-
-    const int            fixed_point_position = a.fixed_point_position();
-    const fixed_point<T> alpha_q(alpha, fixed_point_position);
-    const fixed_point<T> beta_q(beta, fixed_point_position);
-
-    for(int w = 0; w < W; ++w)
-    {
-        for(int depth = 0; depth < D; ++depth)
-        {
-            const int base_addr_a = depth * a_stride_z + w * a_stride_w;
-            const int base_addr_b = depth * b_stride_z + w * b_stride_w;
-            const int base_addr_c = depth * c_stride_z + w * c_stride_w;
-
-            for(int row = 0; row < M; ++row)
-            {
-                for(int col = 0; col < N; ++col)
-                {
-                    fixed_point<promoted_type> acc_q(0, fixed_point_position);
-
-                    for(int k = 0; k < K; ++k)
-                    {
-                        const fixed_point<promoted_type> a0_q(a[base_addr_a + row * K + k], fixed_point_position, true);
-                        const fixed_point<promoted_type> b0_q(b[base_addr_b + k * N + col], fixed_point_position, true);
-
-                        acc_q = acc_q + (a0_q * b0_q);
-                    }
-
-                    // Finalize the result: alpha * A * B + beta * C
-                    const fixed_point<T> c0_q(c[base_addr_c + col + row * N], fixed_point_position, true);
-
-                    fixed_point<T> res_q(acc_q);
-                    res_q = alpha_q * res_q;
-                    res_q = res_q + (beta_q * c0_q);
-
-                    // Store the result
-                    dst[base_addr_c + col + row * N] = res_q.raw();
-                }
-            }
-        }
-    }
-
-    return dst;
-}
-
 template SimpleTensor<float> gemm(const SimpleTensor<float> &a, const SimpleTensor<float> &b, const SimpleTensor<float> &c, float alpha, float beta);
 template SimpleTensor<half> gemm(const SimpleTensor<half> &a, const SimpleTensor<half> &b, const SimpleTensor<half> &c, float alpha, float beta);
-template SimpleTensor<qint8_t> gemm(const SimpleTensor<qint8_t> &a, const SimpleTensor<qint8_t> &b, const SimpleTensor<qint8_t> &c, float alpha, float beta);
-template SimpleTensor<qint16_t> gemm(const SimpleTensor<qint16_t> &a, const SimpleTensor<qint16_t> &b, const SimpleTensor<qint16_t> &c, float alpha, float beta);
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/GEMM.h b/tests/validation/reference/GEMM.h
index cda792b..39007c6 100644
--- a/tests/validation/reference/GEMM.h
+++ b/tests/validation/reference/GEMM.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -38,8 +38,6 @@
 template <typename T, typename std::enable_if<is_floating_point<T>::value, int>::type = 0>
 SimpleTensor<T> gemm(const SimpleTensor<T> &a, const SimpleTensor<T> &b, const SimpleTensor<T> &c, float alpha, float beta);
 
-template <typename T, typename std::enable_if<std::is_integral<T>::value, int>::type = 0>
-SimpleTensor<T> gemm(const SimpleTensor<T> &a, const SimpleTensor<T> &b, const SimpleTensor<T> &c, float alpha, float beta);
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/GEMMInterleave4x4.h b/tests/validation/reference/GEMMInterleave4x4.h
index e6b09af..e3d72d9 100644
--- a/tests/validation/reference/GEMMInterleave4x4.h
+++ b/tests/validation/reference/GEMMInterleave4x4.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,7 +24,6 @@
 #include "GEMM.h"
 
 #include "arm_compute/core/Types.h"
-#include "tests/validation/FixedPoint.h"
 
 namespace arm_compute
 {
diff --git a/tests/validation/reference/GEMMInterleaveBlocked.h b/tests/validation/reference/GEMMInterleaveBlocked.h
index ff5a0d6..d649a51 100644
--- a/tests/validation/reference/GEMMInterleaveBlocked.h
+++ b/tests/validation/reference/GEMMInterleaveBlocked.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,7 +24,6 @@
 #include "GEMM.h"
 
 #include "arm_compute/core/Types.h"
-#include "tests/validation/FixedPoint.h"
 
 namespace arm_compute
 {
diff --git a/tests/validation/reference/GEMMTranspose1xW.h b/tests/validation/reference/GEMMTranspose1xW.h
index d6a2e89..6ec70b1 100644
--- a/tests/validation/reference/GEMMTranspose1xW.h
+++ b/tests/validation/reference/GEMMTranspose1xW.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017, 2018 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,7 +24,6 @@
 #include "GEMM.h"
 
 #include "arm_compute/core/Types.h"
-#include "tests/validation/FixedPoint.h"
 
 namespace arm_compute
 {
diff --git a/tests/validation/reference/Im2Col.cpp b/tests/validation/reference/Im2Col.cpp
index 5685b60..0c41d88 100644
--- a/tests/validation/reference/Im2Col.cpp
+++ b/tests/validation/reference/Im2Col.cpp
@@ -23,8 +23,6 @@
  */
 #include "Im2Col.h"
 
-#include "Permute.h"
-
 #include "arm_compute/core/Types.h"
 #include "tests/validation/Helpers.h"
 #include "tests/validation/reference/Utils.h"
@@ -38,36 +36,46 @@
 namespace reference
 {
 template <typename T>
-void im2col_nchw(const SimpleTensor<T> &src, SimpleTensor<T> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias)
+void im2col_nchw(const SimpleTensor<T> &src, SimpleTensor<T> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, unsigned int num_groups)
 {
     ARM_COMPUTE_ERROR_ON(src.data_layout() != DataLayout::NCHW);
-    // Create reference
-    const int pad_x         = conv_info.pad().first;
-    const int pad_y         = conv_info.pad().second;
     const int stride_x      = conv_info.stride().first;
     const int stride_y      = conv_info.stride().second;
     const int kernel_width  = kernel_dims.width;
     const int kernel_height = kernel_dims.height;
+    const int pad_x         = conv_info.pad().first;
+    const int pad_y         = conv_info.pad().second;
     const int src_width     = src.shape().x();
     const int src_height    = src.shape().y();
-    const int src_depth     = src.shape().z();
+    const int src_channels  = src.shape().z();
     const int batches       = src.shape().total_size_upper(3);
+    const int dst_height    = dst.shape().y();
     const int pad_val       = is_data_type_quantized_asymmetric(src.data_type()) ? src.quantization_info().offset : 0;
+    int       dst_idx       = 0;
 
-    int dst_idx = 0;
+    // Compute width and height of the convolved tensors
+    std::pair<unsigned int, unsigned int> convolved_dims = scaled_dimensions(src_width, src_height, kernel_dims.width, kernel_dims.height, conv_info);
+
     for(int b = 0; b < batches; ++b)
     {
-        for(int y = -pad_y; y <= (src_height + pad_y - kernel_height); y += stride_y)
+        for(int g = 0; g < static_cast<int>(num_groups); ++g)
         {
-            for(int x = -pad_x; x <= (src_width + pad_x - kernel_width); x += stride_x)
+            const int first_group_ch = g * (src_channels / num_groups);
+            const int last_group_ch  = (g + 1) * (src_channels / num_groups);
+
+            for(int yo = 0; yo < dst_height; ++yo)
             {
-                for(int z = 0; z < src_depth; ++z)
+                // Compute input spatial coordinates
+                const int xi = (yo % convolved_dims.first) * stride_x;
+                const int yi = (yo / convolved_dims.first) * stride_y;
+
+                for(int ci = first_group_ch; ci < last_group_ch; ++ci)
                 {
-                    for(int patch_y = y; patch_y < (y + kernel_height); ++patch_y)
+                    for(int yk = 0; yk < kernel_height; ++yk)
                     {
-                        for(int patch_x = x; patch_x < (x + kernel_width); ++patch_x)
+                        for(int xk = 0; xk < kernel_width; ++xk)
                         {
-                            dst[dst_idx++] = tensor_elem_at(src, Coordinates(patch_x, patch_y, z, b), BorderMode::CONSTANT, static_cast<T>(pad_val));
+                            dst[dst_idx++] = tensor_elem_at(src, Coordinates(xi + xk - pad_x, yi + yk - pad_y, ci, b), BorderMode::CONSTANT, static_cast<T>(pad_val));
                         }
                     }
                 }
@@ -97,11 +105,15 @@
     const int batches       = src.shape().total_size_upper(3);
     const int pad_val       = is_data_type_quantized_asymmetric(src.data_type()) ? src.quantization_info().offset : 0;
     int       dst_idx       = 0;
+
+    const int lasty = src_height + (kernel_height > 1 ? pad_y : 0) - kernel_height;
+    const int lastx = src_width + (kernel_width > 1 ? pad_x : 0) - kernel_width;
+
     for(int b = 0; b < batches; ++b)
     {
-        for(int y = -pad_y; y <= (src_height + pad_y - kernel_height); y += stride_y)
+        for(int y = -pad_y; y <= lasty; y += stride_y)
         {
-            for(int x = -pad_x; x <= (src_width + pad_x - kernel_width); x += stride_x)
+            for(int x = -pad_x; x <= lastx; x += stride_x)
             {
                 for(int z = 0; z < src_depth; ++z)
                 {
@@ -124,18 +136,74 @@
 }
 
 template <typename T>
-void im2col(const SimpleTensor<T> &src, SimpleTensor<T> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias)
+void im2col_nhwc_channel_first(const SimpleTensor<T> &src, SimpleTensor<T> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias)
+{
+    ARM_COMPUTE_ERROR_ON(src.data_layout() != DataLayout::NHWC);
+    const int stride_x      = conv_info.stride().first;
+    const int stride_y      = conv_info.stride().second;
+    const int kernel_width  = kernel_dims.width;
+    const int kernel_height = kernel_dims.height;
+    const int pad_x         = conv_info.pad().first;
+    const int pad_y         = conv_info.pad().second;
+    const int src_width     = src.shape().y();
+    const int src_height    = src.shape().z();
+    const int src_channels  = src.shape().x();
+    const int batches       = src.shape().total_size_upper(3);
+    const int dst_width     = has_bias ? dst.shape().x() - 1 : dst.shape().x();
+    const int dst_height    = dst.shape().y();
+    const int pad_val       = is_data_type_quantized_asymmetric(src.data_type()) ? src.quantization_info().offset : 0;
+
+    // Compute width and height of the convolved tensors
+    std::pair<unsigned int, unsigned int> convolved_dims = scaled_dimensions(src_width, src_height, kernel_dims.width, kernel_dims.height, conv_info);
+
+    for(int b = 0; b < batches; ++b)
+    {
+        for(int yo = 0; yo < dst_height; ++yo)
+        {
+            // Compute input spatial coordinates
+            const int xi = (yo % convolved_dims.first) * stride_x;
+            const int yi = (yo / convolved_dims.first) * stride_y;
+
+            for(int ci = 0; ci < src_channels; ++ci)
+            {
+                for(int yk = 0; yk < kernel_height; ++yk)
+                {
+                    for(int xk = 0; xk < kernel_width; ++xk)
+                    {
+                        dst[ci + (xk + yk * kernel_width) * src_channels + yo * dst.shape().x() + b * dst.shape().x() * dst.shape().y()] = tensor_elem_at(src, Coordinates(ci, xi + xk - pad_x, yi + yk - pad_y, b),
+                                                                                                                                           BorderMode::CONSTANT, static_cast<T>(pad_val));
+                    }
+                }
+            }
+
+            if(has_bias)
+            {
+                dst[dst_width + yo * dst.shape().x() + b * dst.shape().x() * dst.shape().y()] = static_cast<T>(1);
+            }
+        }
+    }
+}
+
+template <typename T>
+void im2col(const SimpleTensor<T> &src, SimpleTensor<T> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const unsigned int num_groups, bool channels_first_output_nhwc)
 {
     switch(src.data_layout())
     {
         case DataLayout::NCHW:
         {
-            im2col_nchw(src, dst, kernel_dims, conv_info, has_bias);
+            im2col_nchw(src, dst, kernel_dims, conv_info, has_bias, num_groups);
             break;
         }
         case DataLayout::NHWC:
         {
-            im2col_nhwc(src, dst, kernel_dims, conv_info, has_bias);
+            if(channels_first_output_nhwc)
+            {
+                im2col_nhwc_channel_first(src, dst, kernel_dims, conv_info, has_bias);
+            }
+            else
+            {
+                im2col_nhwc(src, dst, kernel_dims, conv_info, has_bias);
+            }
             break;
         }
         default:
@@ -146,9 +214,12 @@
     }
 }
 
-template void im2col(const SimpleTensor<uint8_t> &src, SimpleTensor<uint8_t> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias);
-template void im2col(const SimpleTensor<half> &src, SimpleTensor<half> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias);
-template void im2col(const SimpleTensor<float> &src, SimpleTensor<float> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias);
+template void im2col(const SimpleTensor<uint8_t> &src, SimpleTensor<uint8_t> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, unsigned int num_groups,
+                     bool channels_first_output_nhwc);
+template void im2col(const SimpleTensor<half> &src, SimpleTensor<half> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, unsigned int num_groups,
+                     bool channels_first_output_nhwc);
+template void im2col(const SimpleTensor<float> &src, SimpleTensor<float> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, unsigned int num_groups,
+                     bool channels_first_output_nhwc);
 } // namespace reference
 } // namespace validation
 } // namespace test
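
Both the rewritten im2col_nchw and the new im2col_nhwc_channel_first walk the output by a single row index yo and recover the patch origin from it; grouping only restricts the channel range to [g * (C / num_groups), (g + 1) * (C / num_groups)). A minimal, self-contained sketch of the yo -> (xi, yi) mapping, with out_w standing in for scaled_dimensions(...).first:

    #include <iostream>

    int main()
    {
        const int src_w = 4, src_h = 4, k = 3, stride = 1, pad = 1;
        const int out_w = (src_w + 2 * pad - k) / stride + 1; // 4
        const int out_h = (src_h + 2 * pad - k) / stride + 1; // 4

        for(int yo = 0; yo < out_w * out_h; ++yo)
        {
            // Top-left corner of the k x k patch, exactly the xi/yi computation in
            // the reference; the pad is subtracted when the element is read, and
            // out-of-bounds reads return the constant pad value.
            const int xi = (yo % out_w) * stride;
            const int yi = (yo / out_w) * stride;
            std::cout << "output row " << yo << " gathers the patch starting at (" << xi - pad << ", " << yi - pad << ")\n";
        }
    }
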
diff --git a/tests/validation/reference/Im2Col.h b/tests/validation/reference/Im2Col.h
index 5277171..84ee237 100644
--- a/tests/validation/reference/Im2Col.h
+++ b/tests/validation/reference/Im2Col.h
@@ -35,7 +35,8 @@
 namespace reference
 {
 template <typename T>
-void im2col(const SimpleTensor<T> &src, SimpleTensor<T> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias);
+void im2col(const SimpleTensor<T> &src, SimpleTensor<T> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const unsigned int num_groups,
+            bool channels_first_output_nhwc = false);
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/LaplacianPyramid.cpp b/tests/validation/reference/LaplacianPyramid.cpp
new file mode 100644
index 0000000..5668474
--- /dev/null
+++ b/tests/validation/reference/LaplacianPyramid.cpp
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "LaplacianPyramid.h"
+
+#include "tests/validation/reference/ArithmeticSubtraction.h"
+#include "tests/validation/reference/DepthConvertLayer.h"
+#include "tests/validation/reference/Gaussian5x5.h"
+#include "tests/validation/reference/GaussianPyramidHalf.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+template <typename T, typename U>
+std::vector<SimpleTensor<U>> laplacian_pyramid(const SimpleTensor<T> &src, SimpleTensor<U> &dst, size_t num_levels, BorderMode border_mode, uint8_t constant_border_value)
+{
+    std::vector<SimpleTensor<T>> pyramid_conv;
+    std::vector<SimpleTensor<U>> pyramid_dst;
+
+    // First, a Gaussian pyramid with SCALE_PYRAMID_HALF is created
+    std::vector<SimpleTensor<T>> gaussian_level_pyramid = reference::gaussian_pyramid_half(src, border_mode, constant_border_value, num_levels);
+
+// For each level i, the corresponding image Ii is blurred with a Gaussian 5x5
+    // filter, and the difference between the two images is the corresponding
+    // level Li of the Laplacian pyramid
+    for(size_t i = 0; i < num_levels; ++i)
+    {
+        const SimpleTensor<T> level_filtered = reference::gaussian5x5(gaussian_level_pyramid[i], border_mode, constant_border_value);
+        pyramid_conv.push_back(level_filtered);
+
+        const SimpleTensor<U> level_sub = reference::arithmetic_subtraction<T, T, U>(gaussian_level_pyramid[i], level_filtered, dst.data_type(), ConvertPolicy::WRAP);
+        pyramid_dst.push_back(level_sub);
+    }
+
+    // Return the lowest resolution image and the pyramid
+    dst = depth_convert<T, U>(pyramid_conv[num_levels - 1], DataType::S16, ConvertPolicy::WRAP, 0);
+
+    return pyramid_dst;
+}
+
+template std::vector<SimpleTensor<int16_t>> laplacian_pyramid(const SimpleTensor<uint8_t> &src, SimpleTensor<int16_t> &dst, size_t num_levels, BorderMode border_mode, uint8_t constant_border_value);
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/reference/FixedPoint.h b/tests/validation/reference/LaplacianPyramid.h
similarity index 77%
copy from tests/validation/reference/FixedPoint.h
copy to tests/validation/reference/LaplacianPyramid.h
index f0117f9..aa76f56 100644
--- a/tests/validation/reference/FixedPoint.h
+++ b/tests/validation/reference/LaplacianPyramid.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,11 +21,10 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef __ARM_COMPUTE_TEST_FIXED_POINT_OPERATION_H__
-#define __ARM_COMPUTE_TEST_FIXED_POINT_OPERATION_H__
+#ifndef __ARM_COMPUTE_TEST_LAPLACIAN_PYRAMID_H__
+#define __ARM_COMPUTE_TEST_LAPLACIAN_PYRAMID_H__
 
 #include "tests/SimpleTensor.h"
-#include "tests/Types.h"
 
 namespace arm_compute
 {
@@ -35,10 +34,10 @@
 {
 namespace reference
 {
-template <typename T>
-SimpleTensor<T> fixed_point_operation(const SimpleTensor<T> &src, FixedPointOp op);
+template <typename T, typename U>
+std::vector<SimpleTensor<U>> laplacian_pyramid(const SimpleTensor<T> &src, SimpleTensor<U> &dst, size_t num_levels, BorderMode border_mode, uint8_t constant_border_value);
 } // namespace reference
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
-#endif /* __ARM_COMPUTE_TEST_FIXED_POINT_OPERATION_H__ */
+#endif /* __ARM_COMPUTE_TEST_LAPLACIAN_PYRAMID_H__ */
diff --git a/tests/validation/reference/LaplacianReconstruct.cpp b/tests/validation/reference/LaplacianReconstruct.cpp
new file mode 100644
index 0000000..2346828
--- /dev/null
+++ b/tests/validation/reference/LaplacianReconstruct.cpp
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "LaplacianReconstruct.h"
+
+#include "arm_compute/core/Types.h"
+#include "tests/validation/reference/ArithmeticAddition.h"
+#include "tests/validation/reference/DepthConvertLayer.h"
+#include "tests/validation/reference/Scale.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+template <typename T, typename U>
+SimpleTensor<U> laplacian_reconstruct(const std::vector<SimpleTensor<T>> &pyramid, const SimpleTensor<T> &low_res, BorderMode border_mode, T constant_border_value)
+{
+    std::vector<SimpleTensor<T>> tmp_pyramid(pyramid);
+
+    const size_t   last_level = pyramid.size() - 1;
+    const DataType data_type  = low_res.data_type();
+
+    // input + L(n-1)
+    tmp_pyramid[last_level] = reference::arithmetic_addition(low_res, pyramid[last_level], data_type, ConvertPolicy::SATURATE);
+
+    // Scale levels n-1 to 1, and add levels n-2 to 0
+    for(size_t i = last_level; i-- > 0;)
+    {
+        const float scale_x = static_cast<float>(tmp_pyramid[i].shape().x()) / tmp_pyramid[i + 1].shape().x();
+        const float scale_y = static_cast<float>(tmp_pyramid[i].shape().y()) / tmp_pyramid[i + 1].shape().y();
+
+        tmp_pyramid[i] = reference::scale(tmp_pyramid[i + 1], scale_x, scale_y, InterpolationPolicy::NEAREST_NEIGHBOR,
+                                          border_mode, constant_border_value, SamplingPolicy::CENTER, false);
+
+        tmp_pyramid[i] = reference::arithmetic_addition(tmp_pyramid[i], pyramid[i], data_type, ConvertPolicy::SATURATE);
+    }
+
+    return reference::depth_convert<T, U>(tmp_pyramid[0], DataType::U8, ConvertPolicy::SATURATE, 0);
+}
+
+template SimpleTensor<uint8_t> laplacian_reconstruct(const std::vector<SimpleTensor<int16_t>> &pyramid, const SimpleTensor<int16_t> &low_res, BorderMode border_mode, int16_t constant_border_value);
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
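
Taken together, the two new files form a round trip: laplacian_pyramid stores L_i = G_i - Gauss5x5(G_i) for each level of the half-scale Gaussian pyramid plus a low-resolution S16 tail, and laplacian_reconstruct upscales and re-adds the levels before converting back to U8. A minimal usage sketch, assuming the test framework headers above are available and that lowest_level_shape matches the last pyramid level (both the function name and that parameter are illustrative):

    #include "arm_compute/core/Types.h"
    #include "tests/SimpleTensor.h"
    #include "tests/validation/reference/LaplacianPyramid.h"
    #include "tests/validation/reference/LaplacianReconstruct.h"

    using namespace arm_compute;
    using namespace arm_compute::test::validation;

    SimpleTensor<uint8_t> laplacian_round_trip(const SimpleTensor<uint8_t> &src, const TensorShape &lowest_level_shape, size_t num_levels)
    {
        // laplacian_pyramid reads dst.data_type() while building each level, so the
        // low-resolution output must already be an S16 tensor of the right shape.
        SimpleTensor<int16_t> low_res{ lowest_level_shape, DataType::S16 };

        const std::vector<SimpleTensor<int16_t>> pyramid =
            reference::laplacian_pyramid<uint8_t, int16_t>(src, low_res, num_levels, BorderMode::CONSTANT, 0);

        // Rebuild a U8 image from the residual levels and the low-resolution tail.
        return reference::laplacian_reconstruct<int16_t, uint8_t>(pyramid, low_res, BorderMode::CONSTANT, 0);
    }
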
diff --git a/tests/validation/reference/FixedPoint.h b/tests/validation/reference/LaplacianReconstruct.h
similarity index 76%
copy from tests/validation/reference/FixedPoint.h
copy to tests/validation/reference/LaplacianReconstruct.h
index f0117f9..76851c6 100644
--- a/tests/validation/reference/FixedPoint.h
+++ b/tests/validation/reference/LaplacianReconstruct.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,11 +21,10 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef __ARM_COMPUTE_TEST_FIXED_POINT_OPERATION_H__
-#define __ARM_COMPUTE_TEST_FIXED_POINT_OPERATION_H__
+#ifndef __ARM_COMPUTE_TEST_LAPLACIAN_RECONSTRUCT_H__
+#define __ARM_COMPUTE_TEST_LAPLACIAN_RECONSTRUCT_H__
 
 #include "tests/SimpleTensor.h"
-#include "tests/Types.h"
 
 namespace arm_compute
 {
@@ -35,10 +34,10 @@
 {
 namespace reference
 {
-template <typename T>
-SimpleTensor<T> fixed_point_operation(const SimpleTensor<T> &src, FixedPointOp op);
+template <typename T, typename U>
+SimpleTensor<U> laplacian_reconstruct(const std::vector<SimpleTensor<T>> &pyramid, const SimpleTensor<T> &low_res, BorderMode border_mode, T constant_border_value);
 } // namespace reference
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
-#endif /* __ARM_COMPUTE_TEST_FIXED_POINT_OPERATION_H__ */
+#endif /* __ARM_COMPUTE_TEST_LAPLACIAN_RECONSTRUCT_H__ */
diff --git a/tests/validation/reference/LocallyConnected.cpp b/tests/validation/reference/LocallyConnected.cpp
index 08e3f02..ecc582b 100644
--- a/tests/validation/reference/LocallyConnected.cpp
+++ b/tests/validation/reference/LocallyConnected.cpp
@@ -41,7 +41,7 @@
 SimpleTensor<T> locally_connected(const SimpleTensor<T> &src, const SimpleTensor<T> &weights, const SimpleTensor<TB> &bias, const TensorShape &output_shape, const PadStrideInfo &info)
 {
     // Create reference
-    SimpleTensor<T> dst{ output_shape, src.data_type(), 1, src.fixed_point_position(), src.quantization_info() };
+    SimpleTensor<T> dst{ output_shape, src.data_type(), 1, src.quantization_info() };
 
     // Compute reference
     const int width_in  = src.shape().x();
diff --git a/tests/validation/reference/MeanStdDev.cpp b/tests/validation/reference/MeanStdDev.cpp
index 4a39b13..f48fcb1 100644
--- a/tests/validation/reference/MeanStdDev.cpp
+++ b/tests/validation/reference/MeanStdDev.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -51,6 +51,8 @@
 }
 
 template std::pair<float, float> mean_and_standard_deviation(const SimpleTensor<uint8_t> &in);
+template std::pair<float, float> mean_and_standard_deviation(const SimpleTensor<half> &in);
+template std::pair<float, float> mean_and_standard_deviation(const SimpleTensor<float> &in);
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/NormalizationLayer.cpp b/tests/validation/reference/NormalizationLayer.cpp
index 226af96..2ae68c6 100644
--- a/tests/validation/reference/NormalizationLayer.cpp
+++ b/tests/validation/reference/NormalizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,7 +24,6 @@
 #include "NormalizationLayer.h"
 
 #include "arm_compute/core/Types.h"
-#include "tests/validation/FixedPoint.h"
 
 namespace arm_compute
 {
@@ -38,7 +37,7 @@
 SimpleTensor<T> normalization_layer(const SimpleTensor<T> &src, NormalizationLayerInfo info)
 {
     // Create reference
-    SimpleTensor<T> dst{ src.shape(), src.data_type(), 1, src.fixed_point_position() };
+    SimpleTensor<T> dst{ src.shape(), src.data_type(), 1 };
 
     // Compute reference
     const uint32_t norm_size = info.norm_size();
@@ -146,129 +145,8 @@
     return dst;
 }
 
-template <typename T, typename std::enable_if<std::is_integral<T>::value, int>::type>
-SimpleTensor<T> normalization_layer(const SimpleTensor<T> &src, NormalizationLayerInfo info)
-{
-    using namespace fixed_point_arithmetic;
-
-    // Create reference
-    SimpleTensor<T> dst{ src.shape(), src.data_type(), 1, src.fixed_point_position() };
-
-    // Compute reference
-    const int fixed_point_position = src.fixed_point_position();
-
-    const uint32_t norm_size = info.norm_size();
-    NormType       type      = info.type();
-    fixed_point<T> beta(info.beta(), fixed_point_position);
-    fixed_point<T> kappa(info.kappa(), fixed_point_position);
-
-    const int cols       = src.shape()[0];
-    const int rows       = src.shape()[1];
-    const int depth      = src.shape()[2];
-    int       upper_dims = src.shape().total_size() / (cols * rows);
-
-    fixed_point<T> coeff(info.scale_coeff(), fixed_point_position);
-    int            radius_cols = norm_size / 2;
-
-    // IN_MAP_1D and CROSS_MAP normalize over a single axis only
-    int radius_rows = (NormType::IN_MAP_2D == type) ? norm_size / 2 : 0;
-
-    if(type == NormType::CROSS_MAP)
-    {
-        // Remove also depth from upper dimensions since it is the dimension we
-        // want to use for normalization
-        upper_dims /= depth;
-
-        for(int r = 0; r < upper_dims; ++r)
-        {
-            for(int i = 0; i < rows; ++i)
-            {
-                for(int k = 0; k < cols; ++k)
-                {
-                    for(int l = 0; l < depth; ++l)
-                    {
-                        fixed_point<T> accumulated_scale(0.f, fixed_point_position);
-
-                        for(int j = -radius_cols; j <= radius_cols; ++j)
-                        {
-                            const int z = l + j;
-
-                            if(z >= 0 && z < depth)
-                            {
-                                const T              value = src[k + i * cols + z * rows * cols + r * cols * rows * depth];
-                                const fixed_point<T> fp_value(value, fixed_point_position, true);
-                                accumulated_scale = add(accumulated_scale, mul(fp_value, fp_value));
-                            }
-                        }
-
-                        accumulated_scale                                             = add(kappa, mul(accumulated_scale, coeff));
-                        dst[k + i * cols + l * rows * cols + r * cols * rows * depth] = accumulated_scale.raw();
-                    }
-                }
-            }
-        }
-    }
-    else
-    {
-        for(int r = 0; r < upper_dims; ++r)
-        {
-            for(int i = 0; i < rows; ++i)
-            {
-                for(int k = 0; k < cols; ++k)
-                {
-                    fixed_point<T> accumulated_scale(0.f, fixed_point_position);
-
-                    for(int j = -radius_rows; j <= radius_rows; ++j)
-                    {
-                        const int y = i + j;
-
-                        for(int l = -radius_cols; l <= radius_cols; ++l)
-                        {
-                            const int x = k + l;
-
-                            if((x >= 0 && y >= 0) && (x < cols && y < rows))
-                            {
-                                const T              value = src[x + y * cols + r * cols * rows];
-                                const fixed_point<T> fp_value(value, fixed_point_position, true);
-                                accumulated_scale = add(accumulated_scale, mul(fp_value, fp_value));
-                            }
-                        }
-                    }
-
-                    accumulated_scale                   = add(kappa, mul(accumulated_scale, coeff));
-                    dst[k + i * cols + r * cols * rows] = accumulated_scale.raw();
-                }
-            }
-        }
-    }
-
-    if(info.beta() == 1.f)
-    {
-        for(int i = 0; i < dst.num_elements(); ++i)
-        {
-            fixed_point<T> res = div(fixed_point<T>(src[i], fixed_point_position, true), fixed_point<T>(dst[i], fixed_point_position, true));
-            dst[i]             = res.raw();
-        }
-    }
-    else
-    {
-        const fixed_point<T> beta(info.beta(), fixed_point_position);
-
-        for(int i = 0; i < dst.num_elements(); ++i)
-        {
-            fixed_point<T> res = pow(fixed_point<T>(dst[i], fixed_point_position, true), beta);
-            res                = div(fixed_point<T>(src[i], fixed_point_position, true), res);
-            dst[i]             = res.raw();
-        }
-    }
-
-    return dst;
-}
-
 template SimpleTensor<float> normalization_layer(const SimpleTensor<float> &src, NormalizationLayerInfo info);
 template SimpleTensor<half> normalization_layer(const SimpleTensor<half> &src, NormalizationLayerInfo info);
-template SimpleTensor<qint8_t> normalization_layer(const SimpleTensor<qint8_t> &src, NormalizationLayerInfo info);
-template SimpleTensor<qint16_t> normalization_layer(const SimpleTensor<qint16_t> &src, NormalizationLayerInfo info);
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/NormalizationLayer.h b/tests/validation/reference/NormalizationLayer.h
index 3f624ff..3448baf 100644
--- a/tests/validation/reference/NormalizationLayer.h
+++ b/tests/validation/reference/NormalizationLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -38,8 +38,6 @@
 template <typename T, typename std::enable_if<is_floating_point<T>::value, int>::type = 0>
 SimpleTensor<T> normalization_layer(const SimpleTensor<T> &src, NormalizationLayerInfo info);
 
-template <typename T, typename std::enable_if<std::is_integral<T>::value, int>::type = 0>
-SimpleTensor<T> normalization_layer(const SimpleTensor<T> &src, NormalizationLayerInfo info);
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/Permute.cpp b/tests/validation/reference/Permute.cpp
index bbb2e8d..29c3c5c 100644
--- a/tests/validation/reference/Permute.cpp
+++ b/tests/validation/reference/Permute.cpp
@@ -42,7 +42,7 @@
     permute(dst_shape, perm);
 
     // Create reference
-    SimpleTensor<T> dst{ dst_shape, src.data_type(), src.num_channels(), src.fixed_point_position(), src.quantization_info() };
+    SimpleTensor<T> dst{ dst_shape, src.data_type(), src.num_channels(), src.quantization_info() };
 
     // Compute reference
     for(int i = 0; i < src.num_elements(); ++i)
diff --git a/tests/validation/reference/PixelWiseMultiplication.cpp b/tests/validation/reference/PixelWiseMultiplication.cpp
index 546a886..859da5c 100644
--- a/tests/validation/reference/PixelWiseMultiplication.cpp
+++ b/tests/validation/reference/PixelWiseMultiplication.cpp
@@ -23,8 +23,6 @@
  */
 #include "PixelWiseMultiplication.h"
 
-#include "tests/validation/FixedPoint.h"
-
 namespace arm_compute
 {
 namespace test
@@ -45,10 +43,10 @@
 {
 /** Compute the result of `src1 * src2 * scale`. The result type always matches the type of @p src2.
  *
- * @param[in] src1            An input value. Data types supported: U8/QS8/QS16/S16/F16/F32.
+ * @param[in] src1            An input value. Data types supported: U8/S16/F16/F32.
  * @param[in] src2            An input value. Data types supported: same as @p src1.
  * @param[in] scale           Scale to apply after multiplication.
- *                            Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15. For QS8 and QS16 scale must be 1.
+ *                            Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
  * @param[in] convert_policy  Overflow policy. Supported overflow policies: Wrap, Saturate
  * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even.
  */
diff --git a/tests/validation/reference/PoolingLayer.cpp b/tests/validation/reference/PoolingLayer.cpp
index 6973454..02c430a 100644
--- a/tests/validation/reference/PoolingLayer.cpp
+++ b/tests/validation/reference/PoolingLayer.cpp
@@ -25,7 +25,6 @@
 
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "tests/validation/FixedPoint.h"
 #include "tests/validation/Helpers.h"
 
 namespace arm_compute
@@ -44,7 +43,7 @@
     ARM_COMPUTE_ERROR_ON(info.is_global_pooling() && (src.shape().x() != src.shape().y()));
 
     // Create reference
-    SimpleTensor<T> dst{ compute_pool_shape(TensorInfo(src.shape(), 1, src.data_type(), src.fixed_point_position()), info), src.data_type(), 1, src.fixed_point_position() };
+    SimpleTensor<T> dst{ compute_pool_shape(TensorInfo(src.shape(), 1, src.data_type()), info), src.data_type(), 1 };
 
     const int   pool_size_x     = info.is_global_pooling() ? src.shape().x() : info.pool_size().width;
     const int   pool_size_y     = info.is_global_pooling() ? src.shape().y() : info.pool_size().height;
@@ -152,128 +151,6 @@
     return dst;
 }
 
-template <typename T, typename std::enable_if<std::is_integral<T>::value, int>::type>
-SimpleTensor<T> pooling_layer(const SimpleTensor<T> &src, const PoolingLayerInfo &info)
-{
-    ARM_COMPUTE_ERROR_ON(info.is_global_pooling() && (src.shape().x() != src.shape().y()));
-
-    const auto w_src      = static_cast<int>(src.shape()[0]);
-    const auto h_src      = static_cast<int>(src.shape()[1]);
-    const int  upper_dims = src.shape().total_size() / (w_src * h_src);
-
-    const int   pool_size_x     = info.is_global_pooling() ? src.shape().x() : info.pool_size().width;
-    const int   pool_size_y     = info.is_global_pooling() ? src.shape().y() : info.pool_size().height;
-    PoolingType type            = info.pool_type();
-    int         pool_stride_x   = info.pad_stride_info().stride().first;
-    int         pool_stride_y   = info.pad_stride_info().stride().second;
-    int         pad_left        = info.pad_stride_info().pad_left();
-    int         pad_top         = info.pad_stride_info().pad_top();
-    int         pad_right       = info.pad_stride_info().pad_right();
-    int         pad_bottom      = info.pad_stride_info().pad_bottom();
-    bool        exclude_padding = info.exclude_padding();
-
-    // Create reference
-    SimpleTensor<T> dst{ compute_pool_shape(TensorInfo(src.shape(), 1, src.data_type(), src.fixed_point_position()), info), src.data_type(), 1, src.fixed_point_position() };
-
-    const auto w_dst = static_cast<int>(dst.shape()[0]);
-    const auto h_dst = static_cast<int>(dst.shape()[1]);
-
-    if(type == PoolingType::MAX)
-    {
-        for(int r = 0; r < upper_dims; ++r)
-        {
-            for(int h = 0; h < h_dst; ++h)
-            {
-                for(int w = 0; w < w_dst; ++w)
-                {
-                    int wstart = w * pool_stride_x - pad_left;
-                    int hstart = h * pool_stride_y - pad_top;
-                    int wend   = std::min(wstart + pool_size_x, w_src);
-                    int hend   = std::min(hstart + pool_size_y, h_src);
-                    wstart     = std::max(wstart, 0);
-                    hstart     = std::max(hstart, 0);
-
-                    T max_val = std::numeric_limits<T>::lowest();
-                    for(int y = hstart; y < hend; ++y)
-                    {
-                        for(int x = wstart; x < wend; ++x)
-                        {
-                            const T val = src[r * h_src * w_src + y * w_src + x];
-                            if(val > max_val)
-                            {
-                                max_val = val;
-                            }
-                        }
-                    }
-
-                    dst[r * h_dst * w_dst + h * w_dst + w] = max_val;
-                }
-            }
-        }
-    }
-    else // Average or l2 pooling
-    {
-        for(int r = 0; r < upper_dims; ++r)
-        {
-            for(int h = 0; h < h_dst; ++h)
-            {
-                for(int w = 0; w < w_dst; ++w)
-                {
-                    int wstart = w * pool_stride_x - pad_left;
-                    int hstart = h * pool_stride_y - pad_top;
-                    int wend   = std::min(wstart + pool_size_x, w_src + pad_right);
-                    int hend   = std::min(hstart + pool_size_y, h_src + pad_bottom);
-                    int pool   = (hend - hstart) * (wend - wstart);
-                    wstart     = std::max(wstart, 0);
-                    hstart     = std::max(hstart, 0);
-                    wend       = std::min(wend, w_src);
-                    hend       = std::min(hend, h_src);
-                    // Exclude padding pixels from the average
-                    if(exclude_padding)
-                    {
-                        pool = (hend - hstart) * (wend - wstart);
-                    }
-
-                    using namespace fixed_point_arithmetic;
-
-                    const int            fixed_point_position = src.fixed_point_position();
-                    const fixed_point<T> const_1(1, fixed_point_position);
-                    const fixed_point<T> invpool_fp(1.f / static_cast<float>(pool), fixed_point_position);
-                    fixed_point<T>       avg_val(0, fixed_point_position, true);
-
-                    if(type == PoolingType::AVG)
-                    {
-                        for(int y = hstart; y < hend; ++y)
-                        {
-                            for(int x = wstart; x < wend; ++x)
-                            {
-                                const fixed_point<T> in_fp(src[r * h_src * w_src + y * w_src + x], fixed_point_position, true);
-                                avg_val = add(avg_val, in_fp);
-                            }
-                        }
-                        dst[r * h_dst * w_dst + h * w_dst + w] = mul(avg_val, invpool_fp).raw();
-                    }
-                    else
-                    {
-                        for(int y = hstart; y < hend; ++y)
-                        {
-                            for(int x = wstart; x < wend; ++x)
-                            {
-                                const fixed_point<T> in_fp(src[r * h_src * w_src + y * w_src + x], fixed_point_position, true);
-                                avg_val = add(avg_val, mul(in_fp, in_fp));
-                            }
-                        }
-                        auto res                               = div(const_1, (inv_sqrt(mul(avg_val, invpool_fp))));
-                        dst[r * h_dst * w_dst + h * w_dst + w] = res.raw();
-                    }
-                }
-            }
-        }
-    }
-
-    return dst;
-}
-
 template <>
 SimpleTensor<uint8_t> pooling_layer<uint8_t>(const SimpleTensor<uint8_t> &src, const PoolingLayerInfo &info)
 {
@@ -285,8 +162,6 @@
 
 template SimpleTensor<float> pooling_layer(const SimpleTensor<float> &src, const PoolingLayerInfo &info);
 template SimpleTensor<half> pooling_layer(const SimpleTensor<half> &src, const PoolingLayerInfo &info);
-template SimpleTensor<qint8_t> pooling_layer(const SimpleTensor<qint8_t> &src, const PoolingLayerInfo &info);
-template SimpleTensor<qint16_t> pooling_layer(const SimpleTensor<qint16_t> &src, const PoolingLayerInfo &info);
 } // namespace reference
 } // namespace validation
 } // namespace test
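The window arithmetic in the removed fixed-point branch is the same one the remaining floating-point path uses: for average/L2 pooling the divisor is the full (padded) window area unless exclude_padding is set, in which case only the in-bounds pixels count. A minimal standalone sketch of that region computation (variable names follow the removed code above; this is not the library's API):

#include <algorithm>

// Pooling window for output element (w, h) and the divisor used for averaging.
// exclude_padding: if true, padded (out-of-bounds) pixels do not count.
struct PoolRegion
{
    int wstart, wend, hstart, hend, area;
};

PoolRegion pool_region(int w, int h,
                       int pool_size_x, int pool_size_y,
                       int pool_stride_x, int pool_stride_y,
                       int pad_left, int pad_top, int pad_right, int pad_bottom,
                       int w_src, int h_src, bool exclude_padding)
{
    int wstart = w * pool_stride_x - pad_left;
    int hstart = h * pool_stride_y - pad_top;
    int wend   = std::min(wstart + pool_size_x, w_src + pad_right);
    int hend   = std::min(hstart + pool_size_y, h_src + pad_bottom);
    int area   = (hend - hstart) * (wend - wstart); // window area including padding

    // Clamp the window to the valid input region
    wstart = std::max(wstart, 0);
    hstart = std::max(hstart, 0);
    wend   = std::min(wend, w_src);
    hend   = std::min(hend, h_src);

    if(exclude_padding)
    {
        area = (hend - hstart) * (wend - wstart); // only in-bounds pixels
    }
    return { wstart, wend, hstart, hend, area };
}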
diff --git a/tests/validation/reference/ReductionOperation.cpp b/tests/validation/reference/ReductionOperation.cpp
index acfcc09..871a761 100644
--- a/tests/validation/reference/ReductionOperation.cpp
+++ b/tests/validation/reference/ReductionOperation.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -53,7 +53,7 @@
     switch(op)
     {
         case ReductionOperation::SUM_SQUARE:
-            return std::accumulate(ptr, ptr + reduce_elements, 0.f, square<T>());
+            return std::accumulate(ptr, ptr + reduce_elements, static_cast<T>(0), square<T>());
         default:
             ARM_COMPUTE_ERROR("Unsupported reduction operation");
     }
@@ -87,6 +87,7 @@
 }
 
 template SimpleTensor<float> reduction_operation(const SimpleTensor<float> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op);
+template SimpleTensor<half> reduction_operation(const SimpleTensor<half> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op);
 } // namespace reference
 } // namespace validation
 } // namespace test
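The SUM_SQUARE change above matters because std::accumulate takes its accumulator type from the initial value, not from the element type: with 0.f the running sum would be a float even for the newly instantiated half tensors, and every step would convert between float and T. Casting the zero to T keeps the whole reduction in the tensor's element type. A standalone illustration (a plain lambda stands in for the library's square<T> functor):

#include <numeric>
#include <vector>

// The accumulator type of std::accumulate is deduced from the init argument,
// so static_cast<T>(0) keeps the reduction in T; 0.f would force a float accumulator.
template <typename T>
T sum_of_squares(const std::vector<T> &v)
{
    return std::accumulate(v.begin(), v.end(), static_cast<T>(0),
                           [](T acc, T x) { return static_cast<T>(acc + x * x); });
}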
diff --git a/tests/validation/reference/SoftmaxLayer.cpp b/tests/validation/reference/SoftmaxLayer.cpp
index 90b9b1f..aa640ad 100644
--- a/tests/validation/reference/SoftmaxLayer.cpp
+++ b/tests/validation/reference/SoftmaxLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,7 +24,6 @@
 #include "SoftmaxLayer.h"
 
 #include "arm_compute/core/Types.h"
-#include "tests/validation/FixedPoint.h"
 
 namespace arm_compute
 {
@@ -38,7 +37,7 @@
 SimpleTensor<T> softmax_layer(const SimpleTensor<T> &src, float beta)
 {
     // Create reference
-    SimpleTensor<T> dst{ src.shape(), src.data_type(), 1, src.fixed_point_position() };
+    SimpleTensor<T> dst{ src.shape(), src.data_type(), 1 };
 
     // Compute reference
     const int cols       = src.shape()[0];
@@ -71,65 +70,21 @@
     return dst;
 }
 
-template <typename T, typename std::enable_if<std::is_integral<T>::value, int>::type>
+template <typename T, typename std::enable_if<std::is_same<T, uint8_t>::value, int>::type>
 SimpleTensor<T> softmax_layer(const SimpleTensor<T> &src, float beta)
 {
-    ARM_COMPUTE_UNUSED(beta);
-
-    using namespace fixed_point_arithmetic;
-
-    // Create reference
-    SimpleTensor<T> dst{ src.shape(), src.data_type(), 1, src.fixed_point_position() };
-
-    // Compute reference
-    const int cols       = src.shape()[0];
-    const int upper_dims = src.num_elements() / cols;
-
-    for(int r = 0; r < upper_dims; ++r)
-    {
-        const T *src_row_ptr = src.data() + r * cols;
-        T       *dst_row_ptr = dst.data() + r * cols;
-
-        // Find max
-        const fixed_point<T> max(*std::max_element(src_row_ptr, src_row_ptr + cols), src.fixed_point_position(), true);
-
-        // Regularize
-        using promoted_type = fixed_point_arithmetic::traits::promote_t<T>;
-        fixed_point<promoted_type> sum(0, src.fixed_point_position(), true);
-        std::transform(src_row_ptr, src_row_ptr + cols, dst_row_ptr, [&](T val)
-        {
-            const fixed_point<T> res = exp(fixed_point<T>(val, src.fixed_point_position(), true) - max);
-            sum                      = add(sum, fixed_point<promoted_type>(res.raw(), src.fixed_point_position(), true));
-            return res.raw();
-        });
-
-        // Normalize
-        fixed_point<T> saturated_sum(sum);
-        std::transform(dst_row_ptr, dst_row_ptr + cols, dst_row_ptr, [&](T val)
-        {
-            return div(fixed_point<T>(val, src.fixed_point_position(), true), saturated_sum).raw();
-        });
-    }
-
-    return dst;
-}
-
-template <>
-SimpleTensor<uint8_t> softmax_layer<uint8_t>(const SimpleTensor<uint8_t> &src, float beta)
-{
     // Note: Output quantization info should always have scale = 1/256 and offset = 0
     const QuantizationInfo output_quantization_info = QuantizationInfo(1.f / 256, 0);
 
-    SimpleTensor<float>   src_tmp = convert_from_asymmetric(src);
-    SimpleTensor<float>   dst_tmp = softmax_layer<float>(src_tmp, beta);
-    SimpleTensor<uint8_t> dst     = convert_to_asymmetric(dst_tmp, output_quantization_info);
+    SimpleTensor<float> src_tmp = convert_from_asymmetric(src);
+    SimpleTensor<float> dst_tmp = softmax_layer<float>(src_tmp, beta);
+    SimpleTensor<T>     dst     = convert_to_asymmetric(dst_tmp, output_quantization_info);
     return dst;
 }
 
 template SimpleTensor<float> softmax_layer(const SimpleTensor<float> &src, float beta);
 template SimpleTensor<half> softmax_layer(const SimpleTensor<half> &src, float beta);
-template SimpleTensor<qint8_t> softmax_layer(const SimpleTensor<qint8_t> &src, float beta);
-template SimpleTensor<qint16_t> softmax_layer(const SimpleTensor<qint16_t> &src, float beta);
+template SimpleTensor<uint8_t> softmax_layer(const SimpleTensor<uint8_t> &src, float beta);
 } // namespace reference
 } // namespace validation
 } // namespace test
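The uint8_t specialization above never computes softmax in the quantized domain: it dequantizes with the input's QuantizationInfo, runs the float reference, and requantizes with scale = 1/256 and offset = 0, so the softmax range [0, 1) maps onto [0, 255]. A sketch of the asymmetric conversion it relies on (helper names and the rounding choice are illustrative, not the library's API):

#include <algorithm>
#include <cmath>
#include <cstdint>

// Asymmetric 8-bit quantization: real = scale * (q - offset).
float dequantize(uint8_t q, float scale, int offset)
{
    return scale * (static_cast<int>(q) - offset);
}

uint8_t quantize(float x, float scale, int offset)
{
    const int q = static_cast<int>(std::lround(x / scale)) + offset;
    return static_cast<uint8_t>(std::max(0, std::min(255, q)));
}

// For the softmax output: quantize(p, 1.f / 256, 0) puts p in [0, 1) onto [0, 255].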
diff --git a/tests/validation/reference/SoftmaxLayer.h b/tests/validation/reference/SoftmaxLayer.h
index a6d4c3b..21dca1e 100644
--- a/tests/validation/reference/SoftmaxLayer.h
+++ b/tests/validation/reference/SoftmaxLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -38,7 +38,7 @@
 template <typename T, typename std::enable_if<is_floating_point<T>::value, int>::type = 0>
 SimpleTensor<T> softmax_layer(const SimpleTensor<T> &src, float beta);
 
-template <typename T, typename std::enable_if<std::is_integral<T>::value, int>::type = 0>
+template <typename T, typename std::enable_if<std::is_same<T, uint8_t>::value, int>::type = 0>
 SimpleTensor<T> softmax_layer(const SimpleTensor<T> &src, float beta);
 } // namespace reference
 } // namespace validation
diff --git a/tests/validation/reference/Transpose.cpp b/tests/validation/reference/Transpose.cpp
index 736f37e..348c703 100644
--- a/tests/validation/reference/Transpose.cpp
+++ b/tests/validation/reference/Transpose.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,7 +24,6 @@
 #include "Transpose.h"
 
 #include "arm_compute/core/Types.h"
-#include "tests/validation/FixedPoint.h"
 #include "tests/validation/Helpers.h"
 
 namespace arm_compute
diff --git a/tests/validation/reference/WeightsReshape.cpp b/tests/validation/reference/WeightsReshape.cpp
new file mode 100644
index 0000000..fc02395
--- /dev/null
+++ b/tests/validation/reference/WeightsReshape.cpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "WeightsReshape.h"
+
+#include "tests/validation/Helpers.h"
+#include "tests/validation/reference/Utils.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+template <typename T>
+SimpleTensor<T> weights_reshape(const SimpleTensor<T> &src, const SimpleTensor<T> &biases, const TensorShape &dst_shape, const unsigned int num_groups)
+{
+    SimpleTensor<T> dst{ dst_shape, src.data_type(), 1 };
+
+    // Compute reference
+    const bool   has_bias  = biases.size() > 0;
+    const size_t linear_sz = src.shape().total_size_lower(3);
+    const size_t group_sz  = src.shape()[3] / num_groups;
+
+    for(size_t g = 0; g < num_groups; ++g)
+    {
+        for(size_t w = 0; w < group_sz; ++w)
+        {
+            const size_t curr_weight = g * group_sz + w;
+
+            size_t i = 0;
+            for(; i < linear_sz; ++i)
+            {
+                dst[coord2index(dst.shape(), Coordinates(w, i, g))] = src[curr_weight * linear_sz + i];
+            }
+            if(has_bias)
+            {
+                dst[coord2index(dst.shape(), Coordinates(w, i, g))] = static_cast<T>(biases[curr_weight]);
+            }
+        }
+    }
+
+    return dst;
+}
+
+template SimpleTensor<float> weights_reshape(const SimpleTensor<float> &src, const SimpleTensor<float> &biases, const TensorShape &dst_shape, const unsigned int num_groups);
+template SimpleTensor<half> weights_reshape(const SimpleTensor<half> &src, const SimpleTensor<half> &biases, const TensorShape &dst_shape, const unsigned int num_groups);
+template SimpleTensor<uint8_t> weights_reshape(const SimpleTensor<uint8_t> &src, const SimpleTensor<uint8_t> &biases, const TensorShape &dst_shape, const unsigned int num_groups);
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
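The new reference flattens each kernel into one column of the destination, one group per z-slice, and appends the bias as a final row when present. For example (weight shapes assumed to be [kernel_w, kernel_h, ifm, ofm]): 16 filters of size 3x3 over 8 input channels split into 2 groups gives linear_sz = 3 * 3 * 8 = 72 and group_sz = 16 / 2 = 8, so with biases the reshaped tensor is 8 x 73 x 2. A hypothetical call matching those shapes:

// Shapes are illustrative only; dst_shape follows directly from the loops above:
// width = group_sz, height = linear_sz (+1 for the bias row), depth = num_groups.
SimpleTensor<float> weights{ TensorShape(3U, 3U, 8U, 16U), DataType::F32 };
SimpleTensor<float> biases{ TensorShape(16U), DataType::F32 };
SimpleTensor<float> reshaped = weights_reshape(weights, biases, TensorShape(8U, 73U, 2U), 2);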
diff --git a/tests/validation/reference/FixedPoint.h b/tests/validation/reference/WeightsReshape.h
similarity index 78%
copy from tests/validation/reference/FixedPoint.h
copy to tests/validation/reference/WeightsReshape.h
index f0117f9..629f1e5 100644
--- a/tests/validation/reference/FixedPoint.h
+++ b/tests/validation/reference/WeightsReshape.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,11 +21,11 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef __ARM_COMPUTE_TEST_FIXED_POINT_OPERATION_H__
-#define __ARM_COMPUTE_TEST_FIXED_POINT_OPERATION_H__
+#ifndef __ARM_COMPUTE_TEST_WEIGHTS_RESHAPE_H__
+#define __ARM_COMPUTE_TEST_WEIGHTS_RESHAPE_H__
 
 #include "tests/SimpleTensor.h"
-#include "tests/Types.h"
+#include "tests/validation/Helpers.h"
 
 namespace arm_compute
 {
@@ -36,9 +36,9 @@
 namespace reference
 {
 template <typename T>
-SimpleTensor<T> fixed_point_operation(const SimpleTensor<T> &src, FixedPointOp op);
+SimpleTensor<T> weights_reshape(const SimpleTensor<T> &src, const SimpleTensor<T> &biases, const TensorShape &dst_shape, const unsigned int num_groups);
 } // namespace reference
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
-#endif /* __ARM_COMPUTE_TEST_FIXED_POINT_OPERATION_H__ */
+#endif /* __ARM_COMPUTE_TEST_WEIGHTS_RESHAPE_H__ */
diff --git a/tests/validation/reference/WidthConcatenateLayer.cpp b/tests/validation/reference/WidthConcatenateLayer.cpp
index fe79b4a..8662199 100644
--- a/tests/validation/reference/WidthConcatenateLayer.cpp
+++ b/tests/validation/reference/WidthConcatenateLayer.cpp
@@ -23,7 +23,6 @@
  */
 #include "WidthConcatenateLayer.h"
 
-#include "tests/validation/FixedPoint.h"
 #include "tests/validation/Helpers.h"
 
 namespace arm_compute
@@ -85,8 +84,7 @@
 
 template SimpleTensor<float> widthconcatenate_layer(const std::vector<SimpleTensor<float>> &srcs);
 template SimpleTensor<half> widthconcatenate_layer(const std::vector<SimpleTensor<half>> &srcs);
-template SimpleTensor<qint8_t> widthconcatenate_layer(const std::vector<SimpleTensor<qint8_t>> &srcs);
-template SimpleTensor<qint16_t> widthconcatenate_layer(const std::vector<SimpleTensor<qint16_t>> &srcs);
+template SimpleTensor<uint8_t> widthconcatenate_layer(const std::vector<SimpleTensor<uint8_t>> &srcs);
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/Winograd.cpp b/tests/validation/reference/Winograd.cpp
index 194a78e..132d252 100644
--- a/tests/validation/reference/Winograd.cpp
+++ b/tests/validation/reference/Winograd.cpp
@@ -29,6 +29,7 @@
 #include "arm_compute/core/Types.h"
 
 #include <algorithm>
+#include <cmath>
 
 namespace arm_compute
 {
@@ -142,13 +143,31 @@
     {
         { WinogradKey(std::pair<int, int>(2, 2), std::pair<int, int>(3, 3), WinogradTransformType::INPUT), imatrix2x2_3x3 },
         { WinogradKey(std::pair<int, int>(4, 4), std::pair<int, int>(3, 3), WinogradTransformType::INPUT), imatrix4x4_3x3 },
+        { WinogradKey(std::pair<int, int>(2, 1), std::pair<int, int>(3, 1), WinogradTransformType::INPUT), imatrix2x2_3x3 },
+        { WinogradKey(std::pair<int, int>(4, 1), std::pair<int, int>(3, 1), WinogradTransformType::INPUT), imatrix4x4_3x3 },
+        { WinogradKey(std::pair<int, int>(1, 2), std::pair<int, int>(1, 3), WinogradTransformType::INPUT), imatrix2x2_3x3 },
+        { WinogradKey(std::pair<int, int>(1, 4), std::pair<int, int>(1, 3), WinogradTransformType::INPUT), imatrix4x4_3x3 },
         { WinogradKey(std::pair<int, int>(4, 4), std::pair<int, int>(5, 5), WinogradTransformType::INPUT), imatrix4x4_5x5 },
+        { WinogradKey(std::pair<int, int>(4, 1), std::pair<int, int>(5, 1), WinogradTransformType::INPUT), imatrix4x4_5x5 },
+        { WinogradKey(std::pair<int, int>(1, 4), std::pair<int, int>(1, 5), WinogradTransformType::INPUT), imatrix4x4_5x5 },
         { WinogradKey(std::pair<int, int>(2, 2), std::pair<int, int>(3, 3), WinogradTransformType::FILTER), fmatrix2x2_3x3 },
         { WinogradKey(std::pair<int, int>(4, 4), std::pair<int, int>(3, 3), WinogradTransformType::FILTER), fmatrix4x4_3x3 },
+        { WinogradKey(std::pair<int, int>(2, 1), std::pair<int, int>(3, 1), WinogradTransformType::FILTER), fmatrix2x2_3x3 },
+        { WinogradKey(std::pair<int, int>(4, 1), std::pair<int, int>(3, 1), WinogradTransformType::FILTER), fmatrix4x4_3x3 },
+        { WinogradKey(std::pair<int, int>(1, 2), std::pair<int, int>(1, 3), WinogradTransformType::FILTER), fmatrix2x2_3x3 },
+        { WinogradKey(std::pair<int, int>(1, 4), std::pair<int, int>(1, 3), WinogradTransformType::FILTER), fmatrix4x4_3x3 },
         { WinogradKey(std::pair<int, int>(4, 4), std::pair<int, int>(5, 5), WinogradTransformType::FILTER), fmatrix4x4_5x5 },
+        { WinogradKey(std::pair<int, int>(4, 1), std::pair<int, int>(5, 1), WinogradTransformType::FILTER), fmatrix4x4_5x5 },
+        { WinogradKey(std::pair<int, int>(1, 4), std::pair<int, int>(1, 5), WinogradTransformType::FILTER), fmatrix4x4_5x5 },
         { WinogradKey(std::pair<int, int>(2, 2), std::pair<int, int>(3, 3), WinogradTransformType::OUTPUT), omatrix2x2_3x3 },
         { WinogradKey(std::pair<int, int>(4, 4), std::pair<int, int>(3, 3), WinogradTransformType::OUTPUT), omatrix4x4_3x3 },
+        { WinogradKey(std::pair<int, int>(2, 1), std::pair<int, int>(3, 1), WinogradTransformType::OUTPUT), omatrix2x2_3x3 },
+        { WinogradKey(std::pair<int, int>(4, 1), std::pair<int, int>(3, 1), WinogradTransformType::OUTPUT), omatrix4x4_3x3 },
+        { WinogradKey(std::pair<int, int>(1, 2), std::pair<int, int>(1, 3), WinogradTransformType::OUTPUT), omatrix2x2_3x3 },
+        { WinogradKey(std::pair<int, int>(1, 4), std::pair<int, int>(1, 3), WinogradTransformType::OUTPUT), omatrix4x4_3x3 },
         { WinogradKey(std::pair<int, int>(4, 4), std::pair<int, int>(5, 5), WinogradTransformType::OUTPUT), omatrix4x4_5x5 },
+        { WinogradKey(std::pair<int, int>(4, 1), std::pair<int, int>(5, 1), WinogradTransformType::OUTPUT), omatrix4x4_5x5 },
+        { WinogradKey(std::pair<int, int>(1, 4), std::pair<int, int>(1, 5), WinogradTransformType::OUTPUT), omatrix4x4_5x5 },
     };
 
     // Find transformation matrix
@@ -189,7 +208,10 @@
     const unsigned int tile_w = output_tile_size.width + kernel_size.width - 1;
     const unsigned int tile_h = output_tile_size.height + kernel_size.height - 1;
 
-    TensorShape tile_dims(tile_w, tile_h);
+    // Get the maximum dimension from the tile size
+    const unsigned int tile_max_dim = std::max(tile_w, tile_h);
+
+    TensorShape tile_dims(tile_max_dim, tile_max_dim);
 
     // Simple tensor for the input tile
     SimpleTensor<T> src_tile{ tile_dims, in.data_type() };
@@ -217,11 +239,46 @@
     const int in_d        = in.shape().z();
     const int out_d       = out.shape().z();
     const int num_batches = in.shape().total_size() / (in_w * in_h * in_d);
-    const int num_tiles_x = std::ceil((in_w - (kernel_size.width - 1) + conv_info.pad_left() + conv_info.pad_right()) / static_cast<float>(output_tile_size.width));
-    const int num_tiles_y = std::ceil((in_h - (kernel_size.height - 1) + conv_info.pad_top() + conv_info.pad_bottom()) / static_cast<float>(output_tile_size.height));
     const int step_x      = output_tile_size.width;
     const int step_y      = output_tile_size.height;
 
+    // Compute the number of output tiles along the x and y direction of size "output_tile_size"
+    const Size2D num_tiles = compute_winograd_convolution_tiles(Size2D(in_w, in_h),
+                                                                kernel_size,
+                                                                output_tile_size,
+                                                                conv_info);
+
+    const int num_tiles_x = num_tiles.width;
+    const int num_tiles_y = num_tiles.height;
+
+    // In case of 1D convolution, the input tile has to be partially filled with zeros
+    int start_x_zero = 0;
+    int start_y_zero = 0;
+    int end_x_zero   = 0;
+    int end_y_zero   = 0;
+
+    if(output_tile_size.width == 1)
+    {
+        start_x_zero = 1;
+        start_y_zero = 0;
+        end_x_zero   = tile_max_dim - 1;
+        end_y_zero   = tile_max_dim;
+    }
+    else if(output_tile_size.height == 1)
+    {
+        start_x_zero = 0;
+        start_y_zero = 1;
+        end_x_zero   = tile_max_dim;
+        end_y_zero   = tile_max_dim - 1;
+    }
+
+    // Set the anchor and shape of the zeros area
+    const Coordinates anchor_zeros(start_x_zero, start_y_zero);
+    const TensorShape shape_zeros(end_x_zero, end_y_zero);
+
+    // If we have a vertical filter (i.e. 1x3, 1x5,..), we need to take the elements along the y direction (step = width of the output tile)
+    const int step_y_transf_tile = kernel_size.width == 1 ? tile_max_dim : 1;
+
     ARM_COMPUTE_ERROR_ON((num_tiles_x * num_tiles_y) != static_cast<int>(out.shape().y()));
 
     for(int b = 0; b < num_batches; ++b)
@@ -238,6 +295,9 @@
                     // Get the tile from the input tensor
                     get_tile(in, src_tile, Coordinates(xi, yi, z, b));
 
+                    // Fill partially with zeros in case of 1D convolution
+                    zeros(src_tile, anchor_zeros, shape_zeros);
+
                     // Compute the transformation
                     matrix_multiply(matrix, src_tile, tmp_tile);
                     matrix_multiply(tmp_tile, matrix_transposed, dst_tile);
@@ -247,7 +307,7 @@
                     {
                         int xo = z;
                         int yo = x + y * num_tiles_x;
-                        out[coords2index(out.shape(), Coordinates(xo, yo, i, b))] = dst_tile[i];
+                        out[coords2index(out.shape(), Coordinates(xo, yo, i, b))] = dst_tile[i * step_y_transf_tile];
                     }
                 }
             }
@@ -268,27 +328,31 @@
     const Size2D output_tile_size = winograd_info.output_tile_size;
     const Size2D kernel_size      = winograd_info.kernel_size;
 
-    TensorShape kernel_tile_dims(kernel_size.width, kernel_size.height);
-
     // Calculate dimensions for the tile
     const unsigned int input_tile_w    = output_tile_size.width + kernel_size.width - 1;
     const unsigned int input_tile_h    = output_tile_size.height + kernel_size.height - 1;
     const unsigned int input_tile_area = input_tile_w * input_tile_h;
 
+    // Get the maximum dimension from the filter size
+    const unsigned int kernel_max_dim = std::max(kernel_size.width, kernel_size.height);
+
+    // Get the maximum dimension from the input tile
+    const unsigned int input_tile_max_dim = std::max(input_tile_w, input_tile_h);
+
     // Simple tensor for the input tile
-    SimpleTensor<T> input_tile{ kernel_tile_dims, in.data_type(), 1 };
+    SimpleTensor<T> input_tile{ TensorShape(kernel_max_dim, kernel_max_dim), in.data_type(), 1 };
 
     // Simple tensor for the transformation matrix
-    SimpleTensor<T> trans_matrix{ TensorShape(kernel_tile_dims[0], input_tile_w), in.data_type(), 1 };
+    SimpleTensor<T> trans_matrix{ TensorShape(kernel_max_dim, input_tile_max_dim), in.data_type(), 1 };
 
     // Simple tensor for the transformation matrix transpose
-    SimpleTensor<T> trans_matrix_transposed{ TensorShape(input_tile_w, kernel_tile_dims[0]), in.data_type(), 1 };
+    SimpleTensor<T> trans_matrix_transposed{ TensorShape(input_tile_max_dim, kernel_max_dim), in.data_type(), 1 };
 
     // Simple tensor for the temporary tile
-    SimpleTensor<T> tmp_tile{ TensorShape(kernel_tile_dims[0], input_tile_w), in.data_type(), 1 };
+    SimpleTensor<T> tmp_tile{ TensorShape(kernel_max_dim, input_tile_max_dim), in.data_type(), 1 };
 
     // Simple tensor for the output tile
-    SimpleTensor<T> transf_tile{ TensorShape(input_tile_w, input_tile_w), in.data_type(), 1 };
+    SimpleTensor<T> transf_tile{ TensorShape(input_tile_max_dim, input_tile_max_dim), in.data_type(), 1 };
 
     // Initialize matrix for the filter transform
     initialize_matrix_transform(trans_matrix, output_tile_size, kernel_size, WinogradTransformType::FILTER);
@@ -300,6 +364,9 @@
     const int num_filters  = in.shape()[3];
     const int num_batches  = in.shape().total_size() / (kernel_size.area() * num_channels * num_filters);
 
+    // If we have a vertical filter (i.e. 1x3, 1x5,..), we need to take the elements along the y direction (step_y_transf_tile = width of the output tile)
+    const int step_y_transf_tile = kernel_size.width == 1 ? input_tile_max_dim : 1;
+
     for(int n = 0; n < num_batches; ++n)
     {
         for(int w = 0; w < num_filters; ++w)
@@ -321,7 +388,7 @@
                 // Store the values across the channels
                 for(unsigned int i = 0; i < input_tile_area; ++i)
                 {
-                    out[output_offset + i * num_filters * num_channels] = transf_tile[i];
+                    out[output_offset + i * num_filters * num_channels] = transf_tile[i * step_y_transf_tile];
                 }
             }
         }
@@ -333,8 +400,6 @@
 template <typename T>
 SimpleTensor<T> winograd_output_transform(const SimpleTensor<T> &in, const SimpleTensor<T> &b, const TensorShape &output_shape, const WinogradInfo &winograd_info)
 {
-    ARM_COMPUTE_ERROR_ON_MSG(winograd_info.output_data_layout != DataLayout::NCHW, "Only supported NCHW data format");
-
     const PadStrideInfo conv_info        = winograd_info.convolution_info;
     const Size2D        input_dimensions = winograd_info.input_dimensions;
     const Size2D        output_tile_size = winograd_info.output_tile_size;
@@ -350,17 +415,21 @@
     const unsigned int out_tile_h = output_tile_size.height;
 
     ARM_COMPUTE_ERROR_ON(in.shape()[2] != (in_tile_w * in_tile_h));
-    ARM_COMPUTE_ERROR_ON(in.shape()[0] != out.shape()[2]);
+    ARM_COMPUTE_ERROR_ON(in.shape()[0] != out.shape()[get_data_layout_dimension_index(winograd_info.output_data_layout, DataLayoutDimension::CHANNEL)]);
+
+    // Get the maximum dimension from the tile size
+    const unsigned int in_tile_max_dim  = std::max(in_tile_w, in_tile_h);
+    const unsigned int out_tile_max_dim = std::max(output_tile_size.width, output_tile_size.height);
 
     // Compute tile dimensions
     // Input tile dimensions
-    TensorShape in_tile_dims(in_tile_w, in_tile_h);
+    TensorShape in_tile_dims(in_tile_max_dim, in_tile_max_dim);
 
     // Output tile dimensions
-    TensorShape out_tile_dims(output_tile_size.width, output_tile_size.height);
+    TensorShape out_tile_dims(out_tile_max_dim, out_tile_max_dim);
 
     // Transformation matrix dimensions
-    TensorShape tr_tile_dims(in_tile_w, output_tile_size.width);
+    TensorShape tr_tile_dims(in_tile_max_dim, out_tile_max_dim);
 
     // Create tensors
     // Simple tensor for the input tile
@@ -402,15 +471,24 @@
     const int stridez_out = stridey_out * h_out;
     const int stridew_out = stridez_out * c_out;
 
-    // Compute number of elements to process in the X and Y direction
-    const int num_elements_x = input_dimensions.width - (kernel_size.width - 1) + conv_info.pad_left() + conv_info.pad_right();
-    const int num_elements_y = input_dimensions.height - (kernel_size.height - 1) + conv_info.pad_top() + conv_info.pad_bottom();
-    const int num_tiles_x    = std::ceil(num_elements_x / static_cast<float>(output_tile_size.width));
-    const int num_tiles_y    = std::ceil(num_elements_y / static_cast<float>(output_tile_size.height));
+    // Compute the number of output tiles along the x and y direction of size "output_tile_size"
+    const Size2D num_tiles = compute_winograd_convolution_tiles(Size2D(input_dimensions.width, input_dimensions.height),
+                                                                kernel_size,
+                                                                output_tile_size,
+                                                                conv_info);
+
+    const int num_tiles_x = num_tiles.width;
+    const int num_tiles_y = num_tiles.height;
 
     ARM_COMPUTE_UNUSED(num_tiles_y);
     ARM_COMPUTE_ERROR_ON(in.shape()[1] != static_cast<unsigned int>(num_tiles_x * num_tiles_y));
 
+    // If we have a vertical filter (i.e. 1x3, 1x5,..), we still need to take the elements along the x direction (step_y_transf_tile = 1)
+    const int step_y_transf_tile = kernel_size.width == 1 ? 1 : output_tile.shape()[0];
+
+    // Initialize with zeros the input tile
+    zeros(input_tile, Coordinates(0, 0), input_tile.shape());
+
     for(int n = 0; n < num_batches; ++n)
     {
         for(int y = 0; y < h_in; ++y)
@@ -443,7 +521,7 @@
                         // Check out-of-bound writes
                         if((xo + xi < w_out) && (yo + yi < h_out))
                         {
-                            out[output_offset + yi * stridey_out + xi] = output_tile[xi + yi * out_tile_w];
+                            out[output_offset + yi * stridey_out + xi] = output_tile[xi + yi * step_y_transf_tile];
 
                             // Add bias
                             out[output_offset + yi * stridey_out + xi] += b[zo];