arm_compute v19.11
diff --git a/tests/validation/reference/ActivationLayer.cpp b/tests/validation/reference/ActivationLayer.cpp
index f573d12..6cdba09 100644
--- a/tests/validation/reference/ActivationLayer.cpp
+++ b/tests/validation/reference/ActivationLayer.cpp
@@ -61,7 +61,7 @@
 
     SimpleTensor<float>   src_tmp = convert_from_asymmetric(src);
     SimpleTensor<float>   dst_tmp = activation_layer<float>(src_tmp, info);
-    SimpleTensor<uint8_t> dst     = convert_to_asymmetric(dst_tmp, dst_qinfo);
+    SimpleTensor<uint8_t> dst     = convert_to_asymmetric<uint8_t>(dst_tmp, dst_qinfo);
     return dst;
 }
 
diff --git a/tests/validation/reference/ActivationLayer.h b/tests/validation/reference/ActivationLayer.h
index 5beca7c..2d5dfdf 100644
--- a/tests/validation/reference/ActivationLayer.h
+++ b/tests/validation/reference/ActivationLayer.h
@@ -66,6 +66,9 @@
         case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
             ret = std::log(static_cast<T>(1) + std::exp(x));
             break;
+        case ActivationLayerInfo::ActivationFunction::ELU:
+            ret = (x > 0) ? x : a * (std::exp(x) - static_cast<T>(1));
+            break;
         case ActivationLayerInfo::ActivationFunction::SQRT:
             ret = std::sqrt(x);
             break;
diff --git a/tests/validation/reference/ArithmeticOperations.cpp b/tests/validation/reference/ArithmeticOperations.cpp
index abd4f31..0ec328e 100644
--- a/tests/validation/reference/ArithmeticOperations.cpp
+++ b/tests/validation/reference/ArithmeticOperations.cpp
@@ -112,7 +112,7 @@
 
         BroadcastUnroll<Coordinates::num_max_dimensions>::unroll(op, src1_tmp, src2_tmp, dst_tmp, convert_policy, id_src1, id_src2, id_dst);
 
-        dst = convert_to_asymmetric(dst_tmp, dst.quantization_info());
+        dst = convert_to_asymmetric<uint8_t>(dst_tmp, dst.quantization_info());
         return dst;
     }
     else
diff --git a/tests/validation/reference/BoundingBoxTransform.cpp b/tests/validation/reference/BoundingBoxTransform.cpp
index 55dd165..e09bcff 100644
--- a/tests/validation/reference/BoundingBoxTransform.cpp
+++ b/tests/validation/reference/BoundingBoxTransform.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -36,16 +36,16 @@
 {
 namespace reference
 {
-template <typename T>
-SimpleTensor<T> bounding_box_transform(const SimpleTensor<T> &boxes, const SimpleTensor<T> &deltas, const BoundingBoxTransformInfo &info)
+template <typename T, typename TDeltas>
+SimpleTensor<T> bounding_box_transform(const SimpleTensor<T> &boxes, const SimpleTensor<TDeltas> &deltas, const BoundingBoxTransformInfo &info)
 {
-    const DataType  boxes_data_type = deltas.data_type();
+    const DataType  boxes_data_type = boxes.data_type();
     SimpleTensor<T> pred_boxes(deltas.shape(), boxes_data_type);
 
-    const size_t num_classes    = deltas.shape()[0] / 4;
-    const size_t num_boxes      = deltas.shape()[1];
-    const T     *deltas_ptr     = deltas.data();
-    T           *pred_boxes_ptr = pred_boxes.data();
+    const size_t   num_classes    = deltas.shape()[0] / 4;
+    const size_t   num_boxes      = deltas.shape()[1];
+    const TDeltas *deltas_ptr     = deltas.data();
+    T             *pred_boxes_ptr = pred_boxes.data();
 
     const int img_h = floor(info.img_height() / info.scale() + 0.5f);
     const int img_w = floor(info.img_width() / info.scale() + 0.5f);
@@ -70,15 +70,15 @@
         for(size_t j = 0; j < num_classes; ++j)
         {
             // Extract deltas
-            const size_t start_delta = i * num_classes * class_fields + class_fields * j;
-            const T      dx          = deltas_ptr[start_delta] / T(info.weights()[0]);
-            const T      dy          = deltas_ptr[start_delta + 1] / T(info.weights()[1]);
-            T            dw          = deltas_ptr[start_delta + 2] / T(info.weights()[2]);
-            T            dh          = deltas_ptr[start_delta + 3] / T(info.weights()[3]);
+            const size_t  start_delta = i * num_classes * class_fields + class_fields * j;
+            const TDeltas dx          = deltas_ptr[start_delta] / TDeltas(info.weights()[0]);
+            const TDeltas dy          = deltas_ptr[start_delta + 1] / TDeltas(info.weights()[1]);
+            TDeltas       dw          = deltas_ptr[start_delta + 2] / TDeltas(info.weights()[2]);
+            TDeltas       dh          = deltas_ptr[start_delta + 3] / TDeltas(info.weights()[3]);
 
             // Clip dw and dh
-            dw = std::min(dw, T(info.bbox_xform_clip()));
-            dh = std::min(dh, T(info.bbox_xform_clip()));
+            dw = std::min(dw, TDeltas(info.bbox_xform_clip()));
+            dh = std::min(dh, TDeltas(info.bbox_xform_clip()));
 
             // Determine the predictions
             const T pred_ctr_x = dx * width + ctr_x;
@@ -98,6 +98,16 @@
 
 template SimpleTensor<float> bounding_box_transform(const SimpleTensor<float> &boxes, const SimpleTensor<float> &deltas, const BoundingBoxTransformInfo &info);
 template SimpleTensor<half> bounding_box_transform(const SimpleTensor<half> &boxes, const SimpleTensor<half> &deltas, const BoundingBoxTransformInfo &info);
+
+template <>
+SimpleTensor<uint16_t> bounding_box_transform(const SimpleTensor<uint16_t> &boxes, const SimpleTensor<uint8_t> &deltas, const BoundingBoxTransformInfo &info)
+{
+    SimpleTensor<float>    boxes_tmp      = convert_from_asymmetric(boxes);
+    SimpleTensor<float>    deltas_tmp     = convert_from_asymmetric(deltas);
+    SimpleTensor<float>    pred_boxes_tmp = bounding_box_transform<float, float>(boxes_tmp, deltas_tmp, info);
+    SimpleTensor<uint16_t> pred_boxes     = convert_to_asymmetric<uint16_t>(pred_boxes_tmp, boxes.quantization_info());
+    return pred_boxes;
+}
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/BoundingBoxTransform.h b/tests/validation/reference/BoundingBoxTransform.h
index 33ef9d9..dbe2a14 100644
--- a/tests/validation/reference/BoundingBoxTransform.h
+++ b/tests/validation/reference/BoundingBoxTransform.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -38,8 +38,8 @@
 {
 namespace reference
 {
-template <typename T>
-SimpleTensor<T> bounding_box_transform(const SimpleTensor<T> &boxes, const SimpleTensor<T> &deltas, const BoundingBoxTransformInfo &info);
+template <typename T, typename TDeltas>
+SimpleTensor<T> bounding_box_transform(const SimpleTensor<T> &boxes, const SimpleTensor<TDeltas> &deltas, const BoundingBoxTransformInfo &info);
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/Box3x3.cpp b/tests/validation/reference/Box3x3.cpp
index 8d304a8..153f26a 100644
--- a/tests/validation/reference/Box3x3.cpp
+++ b/tests/validation/reference/Box3x3.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -39,8 +39,9 @@
 {
     SimpleTensor<T> dst(src.shape(), src.data_type());
     const std::array<T, 9> filter{ { 1, 1, 1, 1, 1, 1, 1, 1, 1 } };
-    const float scale = 1.f / static_cast<float>(filter.size());
-    for(int element_idx = 0; element_idx < src.num_elements(); ++element_idx)
+    const float    scale        = 1.f / static_cast<float>(filter.size());
+    const uint32_t num_elements = src.num_elements();
+    for(uint32_t element_idx = 0; element_idx < num_elements; ++element_idx)
     {
         const Coordinates id = index2coord(src.shape(), element_idx);
         apply_2d_spatial_filter(id, src, dst, TensorShape(3U, 3U), filter.data(), scale, border_mode, constant_border_value);
diff --git a/tests/validation/reference/ChannelCombine.cpp b/tests/validation/reference/ChannelCombine.cpp
index b76dcac..a6c0557 100644
--- a/tests/validation/reference/ChannelCombine.cpp
+++ b/tests/validation/reference/ChannelCombine.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -94,9 +94,10 @@
 
     for(unsigned int plane_idx = 0; plane_idx < dst.size(); ++plane_idx)
     {
-        SimpleTensor<T> &dst_tensor = dst[plane_idx];
+        SimpleTensor<T> &dst_tensor   = dst[plane_idx];
+        const uint32_t   num_elements = dst_tensor.num_elements();
 
-        for(int element_idx = 0; element_idx < dst_tensor.num_elements(); ++element_idx)
+        for(uint32_t element_idx = 0; element_idx < num_elements; ++element_idx)
         {
             Coordinates coord = index2coord(dst_tensor.shape(), element_idx);
 
diff --git a/tests/validation/reference/ColorConvert.cpp b/tests/validation/reference/ColorConvert.cpp
index 9090319..a759594 100644
--- a/tests/validation/reference/ColorConvert.cpp
+++ b/tests/validation/reference/ColorConvert.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -170,7 +170,7 @@
             {
                 case Format::RGB888:
                 case Format::RGBA8888:
-                    colorconvert_helper::detail::colorconvert_iyuv_to_rgb(shape, tensor_planes, dst[0]);
+                    colorconvert_helper::detail::colorconvert_iyuv_to_rgb(tensor_planes, dst[0]);
                     break;
                 default:
                     ARM_COMPUTE_ERROR("Not Supported");
@@ -185,7 +185,7 @@
             {
                 case Format::RGB888:
                 case Format::RGBA8888:
-                    colorconvert_helper::detail::colorconvert_nv12_to_rgb(shape, src_format, tensor_planes, dst[0]);
+                    colorconvert_helper::detail::colorconvert_nv12_to_rgb(src_format, tensor_planes, dst[0]);
                     break;
                 case Format::IYUV:
                     colorconvert_helper::detail::colorconvert_nv_to_iyuv(tensor_planes, src_format, dst);
diff --git a/tests/validation/reference/ColorConvertHelper.h b/tests/validation/reference/ColorConvertHelper.h
index abd1f5d..62eef82 100644
--- a/tests/validation/reference/ColorConvertHelper.h
+++ b/tests/validation/reference/ColorConvertHelper.h
@@ -306,7 +306,7 @@
 }
 
 template <typename T>
-inline void colorconvert_iyuv_to_rgb(const TensorShape &shape, const std::vector<SimpleTensor<T>> &tensor_planes, SimpleTensor<T> &dst)
+inline void colorconvert_iyuv_to_rgb(const std::vector<SimpleTensor<T>> &tensor_planes, SimpleTensor<T> &dst)
 {
     SimpleTensor<T> yvec(TensorShape{ tensor_planes[0].shape().x() / 2, tensor_planes[0].shape().y() }, Format::U8);
     SimpleTensor<T> uvec(TensorShape{ tensor_planes[0].shape().x() / 2, tensor_planes[0].shape().y() }, Format::U8);
@@ -361,7 +361,7 @@
 }
 
 template <typename T>
-inline void colorconvert_nv12_to_rgb(const TensorShape &shape, const Format format, const std::vector<SimpleTensor<T>> &tensor_planes, SimpleTensor<T> &dst)
+inline void colorconvert_nv12_to_rgb(const Format format, const std::vector<SimpleTensor<T>> &tensor_planes, SimpleTensor<T> &dst)
 {
     SimpleTensor<T> yvec(TensorShape{ tensor_planes[0].shape().x() / 2, tensor_planes[0].shape().y() }, Format::U8);
     SimpleTensor<T> uvec(TensorShape{ tensor_planes[0].shape().x() / 2, tensor_planes[0].shape().y() }, Format::U8);
@@ -437,15 +437,15 @@
     SimpleTensor<T> utmp(TensorShape{ src.shape().x() / 2, src.shape().y() }, Format::U8);
     SimpleTensor<T> vtmp(TensorShape{ src.shape().x() / 2, src.shape().y() }, Format::U8);
 
-    int utmp_width  = utmp.shape().x();
-    int utmp_height = utmp.shape().y();
+    uint32_t utmp_width  = utmp.shape().x();
+    uint32_t utmp_height = utmp.shape().y();
 
-    int         uvec_coord_x = 0;
-    int         uvec_coord_y = 0;
+    uint32_t    uvec_coord_x = 0;
+    uint32_t    uvec_coord_y = 0;
     Coordinates uvec_coord{ uvec_coord_x, uvec_coord_y };
-    for(int y = 0; y < utmp_height; y++)
+    for(uint32_t y = 0; y < utmp_height; y++)
     {
-        for(int x = 0; x < utmp_width; x++)
+        for(uint32_t x = 0; x < utmp_width; x++)
         {
             Coordinates coord{ x, y };
             auto       *utmp_pixel = reinterpret_cast<T *>(utmp(coord));
@@ -464,15 +464,15 @@
         }
     }
 
-    int second_plane_x = dst[1].shape().x();
-    int second_plane_y = dst[1].shape().y();
+    uint32_t second_plane_x = dst[1].shape().x();
+    uint32_t second_plane_y = dst[1].shape().y();
 
-    int utmp_coord_x = 0;
-    int utmp_coord_y = 0;
+    uint32_t utmp_coord_x = 0;
+    uint32_t utmp_coord_y = 0;
 
-    for(int y = 0; y < second_plane_y; y++)
+    for(uint32_t y = 0; y < second_plane_y; y++)
     {
-        for(int x = 0; x < second_plane_x; x++)
+        for(uint32_t x = 0; x < second_plane_x; x++)
         {
             Coordinates coord{ x, y };
             Coordinates utmp_top_coord{ utmp_coord_x, utmp_coord_y };
@@ -524,15 +524,15 @@
 
     SimpleTensor<T> utmp(TensorShape{ src.shape().x() / 2, src.shape().y() }, Format::U8);
     SimpleTensor<T> vtmp(TensorShape{ src.shape().x() / 2, src.shape().y() }, Format::U8);
-    int             utmp_width  = utmp.shape().x();
-    int             utmp_height = utmp.shape().y();
+    uint32_t        utmp_width  = utmp.shape().x();
+    uint32_t        utmp_height = utmp.shape().y();
 
-    int         uvec_coord_x = 0;
-    int         uvec_coord_y = 0;
+    uint32_t    uvec_coord_x = 0;
+    uint32_t    uvec_coord_y = 0;
     Coordinates uvec_coord{ uvec_coord_x, uvec_coord_y };
-    for(int y = 0; y < utmp_height; y++)
+    for(uint32_t y = 0; y < utmp_height; y++)
     {
-        for(int x = 0; x < utmp_width; x++)
+        for(uint32_t x = 0; x < utmp_width; x++)
         {
             Coordinates coord{ x, y };
             auto       *utmp_pixel = reinterpret_cast<T *>(utmp(coord));
@@ -551,15 +551,15 @@
         }
     }
 
-    int second_plane_x = dst[1].shape().x();
-    int second_plane_y = dst[1].shape().y();
+    uint32_t second_plane_x = dst[1].shape().x();
+    uint32_t second_plane_y = dst[1].shape().y();
 
-    int utmp_coord_x = 0;
-    int utmp_coord_y = 0;
+    uint32_t utmp_coord_x = 0;
+    uint32_t utmp_coord_y = 0;
 
-    for(int y = 0; y < second_plane_y; y++)
+    for(uint32_t y = 0; y < second_plane_y; y++)
     {
-        for(int x = 0; x < second_plane_x; x++)
+        for(uint32_t x = 0; x < second_plane_x; x++)
         {
             Coordinates coord{ x, y };
             Coordinates utmp_top_coord{ utmp_coord_x, utmp_coord_y };
diff --git a/tests/validation/reference/ComputeAllAnchors.cpp b/tests/validation/reference/ComputeAllAnchors.cpp
index 3f04980..60be7ef 100644
--- a/tests/validation/reference/ComputeAllAnchors.cpp
+++ b/tests/validation/reference/ComputeAllAnchors.cpp
@@ -73,6 +73,15 @@
 }
 template SimpleTensor<float> compute_all_anchors(const SimpleTensor<float> &anchors, const ComputeAnchorsInfo &info);
 template SimpleTensor<half> compute_all_anchors(const SimpleTensor<half> &anchors, const ComputeAnchorsInfo &info);
+
+template <>
+SimpleTensor<int16_t> compute_all_anchors(const SimpleTensor<int16_t> &anchors, const ComputeAnchorsInfo &info)
+{
+    SimpleTensor<float>   anchors_tmp     = convert_from_symmetric(anchors);
+    SimpleTensor<float>   all_anchors_tmp = compute_all_anchors(anchors_tmp, info);
+    SimpleTensor<int16_t> all_anchors     = convert_to_symmetric<int16_t>(all_anchors_tmp, anchors.quantization_info());
+    return all_anchors;
+}
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/ConvertFullyConnectedWeights.cpp b/tests/validation/reference/ConvertFullyConnectedWeights.cpp
index e27846c..5925496 100644
--- a/tests/validation/reference/ConvertFullyConnectedWeights.cpp
+++ b/tests/validation/reference/ConvertFullyConnectedWeights.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -48,7 +48,8 @@
     const unsigned int factor_1                  = is_nchw_to_nhwc ? num_elems_per_input_plane : num_channels;
     const unsigned int factor_2                  = is_nchw_to_nhwc ? num_channels : num_elems_per_input_plane;
 
-    for(int i = 0; i < src.num_elements(); ++i)
+    const uint32_t num_elements = src.num_elements();
+    for(uint32_t i = 0; i < num_elements; ++i)
     {
         const Coordinates coords_in = index2coords(src.shape(), i);
         const Coordinates coords_out(coords_in.x(), coords_in.y() % factor_1 * factor_2 + coords_in.y() / factor_1);
diff --git a/tests/validation/reference/Convolution.cpp b/tests/validation/reference/Convolution.cpp
index e3d4b4a..1083d50 100644
--- a/tests/validation/reference/Convolution.cpp
+++ b/tests/validation/reference/Convolution.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -44,8 +44,8 @@
 
     SimpleTensor<T>       dst(src.shape(), output_data_type);
     SimpleTensor<int32_t> sum(src.shape(), output_data_type);
-
-    for(int element_idx = 0; element_idx < src.num_elements(); ++element_idx)
+    const uint32_t        num_elements = src.num_elements();
+    for(uint32_t element_idx = 0; element_idx < num_elements; ++element_idx)
     {
         const Coordinates id = index2coord(src.shape(), element_idx);
         apply_2d_spatial_filter(id, src, sum, TensorShape(width, height), conv, 1, border_mode, constant_border_value);
diff --git a/tests/validation/reference/Convolution3d.h b/tests/validation/reference/Convolution3d.h
index 30be25f..23918a4 100644
--- a/tests/validation/reference/Convolution3d.h
+++ b/tests/validation/reference/Convolution3d.h
@@ -42,13 +42,16 @@
 }
 
 // 3D convolution for floating point type
-template < typename T, typename TB, typename std::enable_if < validation::is_floating_point<T>::value &&validation::is_floating_point<TB>::value, int >::type = 0 >
-inline void convolution3d(const SimpleTensor<T> &in, const SimpleTensor<T> &weights, const SimpleTensor<TB> &bias, SimpleTensor<T> &out,
+template < typename T, typename TW, typename TB, typename std::enable_if < validation::is_floating_point<T>::value &&validation::is_floating_point<TW>::value
+                                                                           &&validation::is_floating_point<TB>::value,
+                                                                           int >::type = 0 >
+inline void convolution3d(const SimpleTensor<T> &in, const SimpleTensor<TW> &weights, const SimpleTensor<TB> &bias, SimpleTensor<T> &out,
                           int i_offset, int w_offset, int b_offset, int o_offset,
-                          int xi, int yi, int width_in, int height_in, int depth_in, int width_weights, int height_weights, int dilation_x = 1, int dilation_y = 1)
+                          int xi, int yi, int width_in, int height_in, int depth_in, int width_weights, int height_weights, int dilation_x = 1, int dilation_y = 1, int filter_id = 0)
 {
+    ARM_COMPUTE_UNUSED(filter_id);
     const T *in_ptr  = in.data() + i_offset;
-    const T *w_ptr   = weights.data() + w_offset;
+    const TW *w_ptr   = weights.data() + w_offset;
     const TB *b_ptr   = bias.data() + b_offset;
     T        *out_ptr = out.data() + o_offset;
 
@@ -77,8 +80,8 @@
                     const int idx = xk + half_width_weights_start;
                     const int idy = yk + half_height_weights_start;
 
-                    const T i_value = in_ptr[offset_slice_in + xk * dilation_x + yk * dilation_y * width_in];
-                    const T w_value = w_ptr[idx + idy * width_weights + ifm * width_weights * height_weights];
+                    const T  i_value = in_ptr[offset_slice_in + xk * dilation_x + yk * dilation_y * width_in];
+                    const TW w_value = w_ptr[idx + idy * width_weights + ifm * width_weights * height_weights];
 
                     acc += i_value * w_value;
                 }
@@ -91,13 +94,16 @@
 }
 
 // 3D convolution for QASYMM8 type
-template < typename T, typename TB, typename std::enable_if < std::is_same<T, uint8_t>::value &&std::is_same<TB, int32_t>::value, int >::type = 0 >
-inline void convolution3d(const SimpleTensor<T> &in, const SimpleTensor<T> &weights, const SimpleTensor<TB> &bias, SimpleTensor<T> &out,
+template < typename T, typename TW, typename TB, typename std::enable_if < std::is_same<T, uint8_t>::value &&(std::is_same<TW, uint8_t>::value
+                                                                                                              || std::is_same<TW, int8_t>::value)
+                                                                           &&std::is_same<TB, int32_t>::value,
+                                                                           int >::type = 0 >
+inline void convolution3d(const SimpleTensor<T> &in, const SimpleTensor<TW> &weights, const SimpleTensor<TB> &bias, SimpleTensor<T> &out,
                           int i_offset, int w_offset, int b_offset, int o_offset,
-                          int xi, int yi, int width_in, int height_in, int depth_in, int width_weights, int height_weights, int dilation_x = 1, int dilation_y = 1)
+                          int xi, int yi, int width_in, int height_in, int depth_in, int width_weights, int height_weights, int dilation_x = 1, int dilation_y = 1, int filter_id = 0)
 {
     const T *in_ptr  = in.data() + i_offset;
-    const T *w_ptr   = weights.data() + w_offset;
+    const TW *w_ptr   = weights.data() + w_offset;
     const TB *b_ptr   = bias.data() + b_offset;
     T        *out_ptr = out.data() + o_offset;
 
@@ -107,10 +113,22 @@
 
     const int   input_offset   = -iq_info.offset;
     const float input_scale    = iq_info.scale;
-    const int   weights_offset = -wq_info.offset;
-    const float weights_scale  = wq_info.scale;
-    const int   output_offset  = oq_info.offset;
-    const float output_scale   = oq_info.scale;
+    int         weights_offset = -wq_info.offset;
+    float       weights_scale  = wq_info.scale;
+    if(is_data_type_quantized_per_channel(weights.data_type()))
+    {
+        if(is_data_type_quantized_asymmetric(weights.data_type()))
+        {
+            weights_offset = weights.quantization_info().offset()[filter_id];
+        }
+        else
+        {
+            weights_offset = 0;
+        }
+        weights_scale = weights.quantization_info().scale()[filter_id];
+    }
+    const int   output_offset = oq_info.offset;
+    const float output_scale  = oq_info.scale;
 
     int         output_multiplier = 0;
     int         output_shift      = 0;
@@ -142,9 +160,8 @@
                     const int idx = xk + half_width_weights_start;
                     const int idy = yk + half_height_weights_start;
 
-                    const uint8_t i_value = in_ptr[offset_slice_in + xk * dilation_x + yk * dilation_y * width_in];
-                    const uint8_t w_value = w_ptr[idx + idy * width_weights + ifm * width_weights * height_weights];
-
+                    const int32_t i_value = in_ptr[offset_slice_in + xk * dilation_x + yk * dilation_y * width_in];
+                    const int32_t w_value = w_ptr[idx + idy * width_weights + ifm * width_weights * height_weights];
                     acc += (i_value + input_offset) * (w_value + weights_offset);
                 }
             }
diff --git a/tests/validation/reference/ConvolutionLayer.cpp b/tests/validation/reference/ConvolutionLayer.cpp
index 6909011..4d2c1ac 100644
--- a/tests/validation/reference/ConvolutionLayer.cpp
+++ b/tests/validation/reference/ConvolutionLayer.cpp
@@ -45,8 +45,8 @@
 {
 } // namespace
 
-template <typename T, typename TB>
-SimpleTensor<T> convolution_layer_nchw(const SimpleTensor<T> &src, const SimpleTensor<T> &weights, const SimpleTensor<TB> &bias, SimpleTensor<T> &dst, const PadStrideInfo &info,
+template <typename T, typename TW, typename TB>
+SimpleTensor<T> convolution_layer_nchw(const SimpleTensor<T> &src, const SimpleTensor<TW> &weights, const SimpleTensor<TB> &bias, SimpleTensor<T> &dst, const PadStrideInfo &info,
                                        const Size2D &dilation, unsigned int num_groups)
 {
     ARM_COMPUTE_ERROR_ON((src.shape()[2] / num_groups) != weights.shape()[2]);
@@ -73,7 +73,6 @@
     const int end_xi      = output_wh.first * stride_xi;
     const int end_yi      = output_wh.second * stride_yi;
     const int num_batches = src.shape().total_size() / (width_in * height_in * depth_in);
-
     for(int r = 0; r < num_batches; ++r)
     {
         for(int yi = start_yi; yi < start_yi + end_yi; yi += stride_yi)
@@ -100,17 +99,16 @@
                                                               offset_in, offset_w, offset_b, offset_out,
                                                               xi, yi,
                                                               width_in, height_in, (depth_in / num_groups),
-                                                              width_weights, height_weights, dilation.x(), dilation.y());
+                                                              width_weights, height_weights, dilation.x(), dilation.y(), ofm);
                     }
                 }
             }
         }
     }
-
     return dst;
 }
-template <typename T, typename TB>
-SimpleTensor<T> convolution_layer(const SimpleTensor<T> &src, const SimpleTensor<T> &weights, const SimpleTensor<TB> &bias, const TensorShape &output_shape, const PadStrideInfo &info,
+template <typename T, typename TW, typename TB>
+SimpleTensor<T> convolution_layer(const SimpleTensor<T> &src, const SimpleTensor<TW> &weights, const SimpleTensor<TB> &bias, const TensorShape &output_shape, const PadStrideInfo &info,
                                   const Size2D &dilation, unsigned int num_groups, QuantizationInfo out_quant_info)
 {
     // if no explicit quantization has been set you the same as src
@@ -123,9 +121,9 @@
 
     if(src.data_layout() == DataLayout::NHWC)
     {
-        SimpleTensor<T> src_nchw     = reference::permute<T>(src, PermutationVector(1U, 2U, 0U));
-        SimpleTensor<T> weights_nchw = reference::permute<T>(weights, PermutationVector(1U, 2U, 0U));
-        SimpleTensor<T> dst_nchw     = reference::permute<T>(dst, PermutationVector(1U, 2U, 0U));
+        SimpleTensor<T>  src_nchw     = reference::permute<T>(src, PermutationVector(1U, 2U, 0U));
+        SimpleTensor<TW> weights_nchw = reference::permute<TW>(weights, PermutationVector(1U, 2U, 0U));
+        SimpleTensor<T>  dst_nchw     = reference::permute<T>(dst, PermutationVector(1U, 2U, 0U));
 
         return reference::permute<T>(convolution_layer_nchw(src_nchw, weights_nchw, bias, dst_nchw, info, dilation, num_groups), PermutationVector(2U, 0U, 1U));
     }
@@ -141,6 +139,8 @@
                                               const PadStrideInfo &info, const Size2D &dilation, unsigned int num_groups, QuantizationInfo out_quant_info);
 template SimpleTensor<uint8_t> convolution_layer(const SimpleTensor<uint8_t> &src, const SimpleTensor<uint8_t> &weights, const SimpleTensor<int32_t> &bias, const TensorShape &output_shape,
                                                  const PadStrideInfo &info, const Size2D &dilation, unsigned int num_groups, QuantizationInfo out_quant_info);
+template SimpleTensor<uint8_t> convolution_layer(const SimpleTensor<uint8_t> &src, const SimpleTensor<int8_t> &weights, const SimpleTensor<int32_t> &bias, const TensorShape &output_shape,
+                                                 const PadStrideInfo &info, const Size2D &dilation, unsigned int num_groups, QuantizationInfo out_quant_info);
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/ConvolutionLayer.h b/tests/validation/reference/ConvolutionLayer.h
index c51a9b3..8f41073 100644
--- a/tests/validation/reference/ConvolutionLayer.h
+++ b/tests/validation/reference/ConvolutionLayer.h
@@ -35,8 +35,8 @@
 {
 namespace reference
 {
-template <typename T, typename TB>
-SimpleTensor<T> convolution_layer(const SimpleTensor<T> &src, const SimpleTensor<T> &weights, const SimpleTensor<TB> &bias, const TensorShape &output_shape, const PadStrideInfo &info,
+template <typename T, typename TW, typename TB>
+SimpleTensor<T> convolution_layer(const SimpleTensor<T> &src, const SimpleTensor<TW> &weights, const SimpleTensor<TB> &bias, const TensorShape &output_shape, const PadStrideInfo &info,
                                   const Size2D &dilation = Size2D(1U, 1U), unsigned int num_groups = 1, QuantizationInfo out_quant_info = QuantizationInfo());
 } // namespace reference
 } // namespace validation
diff --git a/tests/validation/reference/CropResize.cpp b/tests/validation/reference/CropResize.cpp
index f25a031..68ee455 100644
--- a/tests/validation/reference/CropResize.cpp
+++ b/tests/validation/reference/CropResize.cpp
@@ -123,7 +123,7 @@
 template <typename T>
 SimpleTensor<float> crop_image(const SimpleTensor<T> &src, Coordinates start, Coordinates end, int32_t batch_index, float extrapolation_value)
 {
-    TensorShape out_shape(src.shape()[0], abs(end[0] - start[0]) + 1, abs(end[1] - start[1]) + 1);
+    TensorShape out_shape(src.shape()[0], static_cast<uint32_t>(abs(end[0] - start[0])) + 1, static_cast<uint32_t>(abs(end[1] - start[1])) + 1);
 
     SimpleTensor<float> out{ out_shape, DataType::F32, 1, QuantizationInfo(), DataLayout::NHWC };
 
diff --git a/tests/validation/reference/DFT.cpp b/tests/validation/reference/DFT.cpp
index 6ad1b9e..b3c2c6b 100644
--- a/tests/validation/reference/DFT.cpp
+++ b/tests/validation/reference/DFT.cpp
@@ -237,11 +237,11 @@
 template <typename T>
 SimpleTensor<T> complex_mul_and_reduce(const SimpleTensor<T> &input, const SimpleTensor<T> &weights)
 {
-    const int W  = input.shape().x();
-    const int H  = input.shape().y();
-    const int Ci = input.shape().z();
-    const int Co = weights.shape()[3];
-    const int N  = input.shape().total_size() / (W * H * Ci);
+    const uint32_t W  = input.shape().x();
+    const uint32_t H  = input.shape().y();
+    const uint32_t Ci = input.shape().z();
+    const uint32_t Co = weights.shape()[3];
+    const uint32_t N  = input.shape().total_size() / (W * H * Ci);
 
     TensorShape output_shape = input.shape();
     output_shape.set(2, Co);
@@ -250,19 +250,19 @@
     // MemSet dst memory to zero
     std::memset(dst.data(), 0, dst.size());
 
-    for(int b = 0; b < N; ++b)
+    for(uint32_t b = 0; b < N; ++b)
     {
-        for(int co = 0; co < Co; ++co)
+        for(uint32_t co = 0; co < Co; ++co)
         {
-            for(int ci = 0; ci < Ci; ++ci)
+            for(uint32_t ci = 0; ci < Ci; ++ci)
             {
-                for(int h = 0; h < H; ++h)
+                for(uint32_t h = 0; h < H; ++h)
                 {
-                    for(int w = 0; w < W; ++w)
+                    for(uint32_t w = 0; w < W; ++w)
                     {
-                        size_t            i_index  = w + h * W + ci * H * W + b * H * W * Ci;
-                        size_t            w_index  = w + h * W + ci * H * W + co * H * W * Ci;
-                        size_t            o_index  = w + h * W + co * H * W + b * H * W * Co;
+                        const uint32_t    i_index  = w + h * W + ci * H * W + b * H * W * Ci;
+                        const uint32_t    w_index  = w + h * W + ci * H * W + co * H * W * Ci;
+                        const uint32_t    o_index  = w + h * W + co * H * W + b * H * W * Co;
                         const Coordinates i_coords = index2coords(input.shape(), i_index);
                         const Coordinates w_coords = index2coords(weights.shape(), w_index);
                         const Coordinates o_coords = index2coords(dst.shape(), o_index);
diff --git a/tests/validation/reference/DeconvolutionLayer.cpp b/tests/validation/reference/DeconvolutionLayer.cpp
index af59830..0e0ea57 100644
--- a/tests/validation/reference/DeconvolutionLayer.cpp
+++ b/tests/validation/reference/DeconvolutionLayer.cpp
@@ -35,24 +35,47 @@
 {
 template <typename T, typename TB>
 SimpleTensor<T> deconvolution_layer(const SimpleTensor<T> &src, const SimpleTensor<T> &weights, const SimpleTensor<TB> &bias, const TensorShape &output_shape,
-                                    const PadStrideInfo &info)
+                                    const PadStrideInfo &info, QuantizationInfo out_qinfo)
 {
     // Create reference
-    const int stride_x           = info.stride().first;
-    const int stride_y           = info.stride().second;
-    const int weights_width      = weights.shape().x();
-    const int weights_height     = weights.shape().y();
-    const int weights_upper_dims = weights.shape().total_size() / (weights_width * weights_height);
+    const unsigned int pad_left   = info.pad_left();
+    const unsigned int pad_right  = info.pad_right();
+    const unsigned int pad_top    = info.pad_top();
+    const unsigned int pad_bottom = info.pad_bottom();
+    const int stride_x            = info.stride().first;
+    const int stride_y            = info.stride().second;
+    const int weights_width       = weights.shape().x();
+    const int weights_height      = weights.shape().y();
+    const int weights_upper_dims  = weights.shape().total_size() / (weights_width * weights_height);
+
+    ARM_COMPUTE_ERROR_ON(pad_left   > (weights.shape().x() - 1));
+    ARM_COMPUTE_ERROR_ON(pad_right  > (weights.shape().x() - 1));
+    ARM_COMPUTE_ERROR_ON(pad_top    > (weights.shape().y() - 1));
+    ARM_COMPUTE_ERROR_ON(pad_bottom > (weights.shape().y() - 1));
 
     // Find the upsampled dimensions
     unsigned int out_x = (src.shape().x() - 1) * stride_x + 1;
     unsigned int out_y = (src.shape().y() - 1) * stride_y + 1;
 
     // Find the padding needed for the convolution with stride 1 in order to match output shape
-    unsigned int padx = output_shape.x() - (out_x - weights_width + 1);
-    unsigned int pady = output_shape.y() - (out_y - weights_height + 1);
-    out_x += padx;
-    out_y += pady;
+    unsigned int deconv_pad_x = output_shape.x() - (out_x - weights_width + 1);
+    unsigned int deconv_pad_y = output_shape.y() - (out_y - weights_height + 1);
+    out_x += deconv_pad_x;
+    out_y += deconv_pad_y;
+
+    unsigned int deconv_pad_left  = pad_right > pad_left ? pad_right - pad_left : 0;
+    unsigned int deconv_pad_right = pad_left > pad_right ? pad_left - pad_right : 0;
+    deconv_pad_x -= deconv_pad_left + deconv_pad_right;
+    ARM_COMPUTE_ERROR_ON((deconv_pad_x % 2) != 0);
+    deconv_pad_left  += deconv_pad_x / 2;
+    deconv_pad_right += deconv_pad_x / 2;
+
+    unsigned int deconv_pad_top    = pad_bottom > pad_top ? pad_bottom - pad_top : 0;
+    unsigned int deconv_pad_bottom = pad_top > pad_bottom ? pad_top - pad_bottom : 0;
+    deconv_pad_y -= deconv_pad_top + deconv_pad_bottom;
+    ARM_COMPUTE_ERROR_ON((deconv_pad_y % 2) != 0);
+    deconv_pad_top    += deconv_pad_y / 2;
+    deconv_pad_bottom += deconv_pad_y / 2;
 
     TensorShape scaled_shape = src.shape();
     scaled_shape.set(0, out_x);
@@ -64,7 +87,6 @@
     const int width_scaled  = scaled.shape().x();
     const int height_scaled = scaled.shape().y();
     const int num_2d_slices = src.shape().total_size() / (width_in * height_in);
-    ARM_COMPUTE_ERROR_ON(info.pad().first > (weights.shape().x() - 1));
 
     if(src.data_type() == DataType::QASYMM8)
     {
@@ -94,10 +116,10 @@
     {
         const int offset_slice_in  = slice * width_in * height_in;
         const int offset_slice_out = slice * width_scaled * height_scaled;
-        const int start_x          = padx / 2;
-        const int start_y          = pady / 2;
-        const int end_y            = height_scaled - pady / 2;
-        const int end_x            = width_scaled - padx / 2;
+        const int start_x          = deconv_pad_left;
+        const int start_y          = deconv_pad_top;
+        const int end_x            = width_scaled - deconv_pad_right;
+        const int end_y            = height_scaled - deconv_pad_bottom;
 
         for(int yi = start_y, in_y = 0; yi < end_y; yi += stride_y, in_y++)
         {
@@ -111,15 +133,15 @@
     }
 
     const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
-    return convolution_layer(scaled, weights_flipped, bias, output_shape, conv_info);
+    return convolution_layer(scaled, weights_flipped, bias, output_shape, conv_info, Size2D(1U, 1U), 1, out_qinfo);
 }
 
 template SimpleTensor<uint8_t> deconvolution_layer(const SimpleTensor<uint8_t> &src, const SimpleTensor<uint8_t> &weights, const SimpleTensor<int32_t> &bias, const TensorShape &output_shape,
-                                                   const PadStrideInfo &info);
+                                                   const PadStrideInfo &info, QuantizationInfo out_quant_info);
 template SimpleTensor<float> deconvolution_layer(const SimpleTensor<float> &src, const SimpleTensor<float> &weights, const SimpleTensor<float> &bias, const TensorShape &output_shape,
-                                                 const PadStrideInfo &info);
+                                                 const PadStrideInfo &info, QuantizationInfo out_quant_info);
 template SimpleTensor<half> deconvolution_layer(const SimpleTensor<half> &src, const SimpleTensor<half> &weights, const SimpleTensor<half> &bias, const TensorShape &output_shape,
-                                                const PadStrideInfo &info);
+                                                const PadStrideInfo &info, QuantizationInfo out_quant_info);
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/DeconvolutionLayer.h b/tests/validation/reference/DeconvolutionLayer.h
index 21583e3..e7c2f9d 100644
--- a/tests/validation/reference/DeconvolutionLayer.h
+++ b/tests/validation/reference/DeconvolutionLayer.h
@@ -46,7 +46,8 @@
  *
  */
 template <typename T, typename TB>
-SimpleTensor<T> deconvolution_layer(const SimpleTensor<T> &src, const SimpleTensor<T> &weights, const SimpleTensor<TB> &bias, const TensorShape &output_shape, const PadStrideInfo &info);
+SimpleTensor<T> deconvolution_layer(const SimpleTensor<T> &src, const SimpleTensor<T> &weights, const SimpleTensor<TB> &bias, const TensorShape &output_shape, const PadStrideInfo &info,
+                                    QuantizationInfo out_qinfo = QuantizationInfo());
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/DepthwiseConvolutionLayer.cpp b/tests/validation/reference/DepthwiseConvolutionLayer.cpp
index b1d2b92..608093d 100644
--- a/tests/validation/reference/DepthwiseConvolutionLayer.cpp
+++ b/tests/validation/reference/DepthwiseConvolutionLayer.cpp
@@ -40,7 +40,9 @@
 {
 namespace reference
 {
-/** Perform a depthwise convolution
+namespace
+{
+/** Perform a depthwise convolution for floating-point types
  *
  * - Three dimensions tensors
  * - Third dimention is number of channels
@@ -48,9 +50,9 @@
  * - Padding, stride and output shape "match"
  *
  */
-template <typename T, typename TB>
-SimpleTensor<T> depthwise_convolution(const SimpleTensor<T> &src, const SimpleTensor<T> &weights, const SimpleTensor<TB> &biases, const TensorShape &dst_shape, const PadStrideInfo &conv_info,
-                                      unsigned int depth_multiplier, const Size2D &dilation, const QuantizationInfo &out_quant_info)
+template <typename T>
+SimpleTensor<T> depthwise_convolution_fp(const SimpleTensor<T> &src, const SimpleTensor<T> &weights, const SimpleTensor<T> &biases, const TensorShape &dst_shape, const PadStrideInfo &conv_info,
+                                         unsigned int depth_multiplier, const Size2D &dilation, const QuantizationInfo &out_quant_info)
 {
     ARM_COMPUTE_UNUSED(out_quant_info);
 
@@ -114,7 +116,7 @@
                             }
                         }
 
-                        dst[out_pos++] = saturate_cast<T>(val + *static_cast<const TB *>(biases(Coordinates(out_z))));
+                        dst[out_pos++] = saturate_cast<T>(val + *static_cast<const T *>(biases(Coordinates(out_z))));
                     }
                 }
             }
@@ -124,26 +126,32 @@
     return dst;
 }
 
-template <>
-SimpleTensor<uint8_t> depthwise_convolution(const SimpleTensor<uint8_t> &src, const SimpleTensor<uint8_t> &weights, const SimpleTensor<int32_t> &biases, const TensorShape &dst_shape,
-                                            const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation, const QuantizationInfo &out_quant_info)
+/** Perform a quantized depthwise convolution
+ *
+ * - Three dimensions tensors
+ * - Third dimention is number of channels
+ * - Depths of input tensor and filter are equals
+ * - Padding, stride and output shape "match"
+ * - QASYMM8 input, output
+ * - QASYMM8 or QSYMM8_PER_CHANNEL filter
+ *
+ */
+template <typename T, typename TW, typename TB>
+SimpleTensor<T> depthwise_convolution_quantized(const SimpleTensor<T> &src, const SimpleTensor<TW> &weights, const SimpleTensor<int32_t> &biases, const TensorShape &dst_shape,
+                                                const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation, const QuantizationInfo &out_quant_info)
 {
     // if no explicit quantization has been set you the same as src
     const QuantizationInfo &dst_qinfo = out_quant_info.uniform().empty() ? src.quantization_info() : out_quant_info;
-    SimpleTensor<uint8_t>   dst{ dst_shape, src.data_type(), 1, dst_qinfo };
+    SimpleTensor<T>         dst{ dst_shape, src.data_type(), 1, dst_qinfo };
 
     // Create reference
     const int   input_offset   = -src.quantization_info().uniform().offset;
     const float input_scale    = src.quantization_info().uniform().scale;
     const int   weights_offset = -weights.quantization_info().uniform().offset;
-    const float weights_scale  = weights.quantization_info().uniform().scale;
     const int   output_offset  = dst_qinfo.uniform().offset;
     const float output_scale   = dst_qinfo.uniform().scale;
 
-    int         output_multiplier = 0;
-    int         output_shift      = 0;
-    const float multiplier        = input_scale * weights_scale / output_scale;
-    arm_compute::quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
+    const std::vector<float> weights_scale_vec = weights.quantization_info().scale();
 
     // Compute reference
     const int filter_width  = weights.shape().x();
@@ -173,6 +181,8 @@
     const int maximum_x = input_width + pad_left + pad_right - static_cast<int>(patch_width);
     const int maximum_y = input_height + pad_top + pad_bottom - static_cast<int>(patch_height);
 
+    const bool is_quantized_per_channel = is_data_type_quantized_per_channel(weights.data_type());
+
     int out_pos = 0;
     for(int r = 0; r < num_batches; ++r)
     {
@@ -183,6 +193,12 @@
                 const int     out_z    = z * depth_multiplier + m;
                 const int32_t bias_val = *static_cast<const int32_t *>(biases(Coordinates(out_z)));
 
+                int         output_multiplier = 0;
+                int         output_shift      = 0;
+                const float weights_scale     = (is_quantized_per_channel) ? weights_scale_vec[out_z] : weights_scale_vec[0];
+                const float multiplier        = input_scale * weights_scale / output_scale;
+                arm_compute::quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
+
                 for(int y = minimum_y; y <= minimum_y + maximum_y; y += conv_info.stride().second)
                 {
                     for(int x = minimum_x; x <= minimum_x + maximum_x; x += conv_info.stride().first)
@@ -197,8 +213,8 @@
                             {
                                 coords.set(0, i);
                                 coords.set(1, j);
-                                const auto    in_val = tensor_elem_at<uint8_t>(src, coords, BorderMode::CONSTANT, -input_offset);
-                                const uint8_t w_val  = *(weights.data() + filter_offset);
+                                const auto in_val = tensor_elem_at<T>(src, coords, BorderMode::CONSTANT, -input_offset);
+                                const TW   w_val  = *(weights.data() + filter_offset);
                                 val += (in_val + input_offset) * (w_val + weights_offset);
                                 ++filter_offset;
                             }
@@ -206,8 +222,7 @@
                         val += bias_val;
                         val = asymm_rounding_divide_by_pow2(asymm_int_mult(val, output_multiplier), output_shift);
                         val += output_offset;
-                        val = std::max<int32_t>(val, 0);
-                        val = std::min<int32_t>(val, 255);
+                        val = utility::clamp<int32_t>(val, 0, 255);
 
                         // Store the result
                         dst[out_pos++] = val;
@@ -219,12 +234,35 @@
 
     return dst;
 }
+} // namespace
 
-template SimpleTensor<float> depthwise_convolution(const SimpleTensor<float> &src, const SimpleTensor<float> &weights, const SimpleTensor<float> &biases, const TensorShape &dst_shape,
-                                                   const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation, const QuantizationInfo &out_quant_info);
+template <>
+SimpleTensor<float> depthwise_convolution(const SimpleTensor<float> &src, const SimpleTensor<float> &weights, const SimpleTensor<float> &biases, const TensorShape &dst_shape,
+                                          const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation, const QuantizationInfo &out_quant_info)
+{
+    return depthwise_convolution_fp(src, weights, biases, dst_shape, conv_info, depth_multiplier, dilation, out_quant_info);
+}
 
-template SimpleTensor<half> depthwise_convolution(const SimpleTensor<half> &src, const SimpleTensor<half> &weights, const SimpleTensor<half> &biases, const TensorShape &dst_shape,
-                                                  const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation, const QuantizationInfo &out_quant_info);
+template <>
+SimpleTensor<half> depthwise_convolution(const SimpleTensor<half> &src, const SimpleTensor<half> &weights, const SimpleTensor<half> &biases, const TensorShape &dst_shape,
+                                         const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation, const QuantizationInfo &out_quant_info)
+{
+    return depthwise_convolution_fp(src, weights, biases, dst_shape, conv_info, depth_multiplier, dilation, out_quant_info);
+}
+
+template <>
+SimpleTensor<uint8_t> depthwise_convolution(const SimpleTensor<uint8_t> &src, const SimpleTensor<uint8_t> &weights, const SimpleTensor<int32_t> &biases, const TensorShape &dst_shape,
+                                            const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation, const QuantizationInfo &out_quant_info)
+{
+    return depthwise_convolution_quantized<uint8_t, uint8_t, int32_t>(src, weights, biases, dst_shape, conv_info, depth_multiplier, dilation, out_quant_info);
+}
+
+template <>
+SimpleTensor<uint8_t> depthwise_convolution(const SimpleTensor<uint8_t> &src, const SimpleTensor<int8_t> &weights, const SimpleTensor<int32_t> &biases, const TensorShape &dst_shape,
+                                            const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation, const QuantizationInfo &out_quant_info)
+{
+    return depthwise_convolution_quantized<uint8_t, int8_t, int32_t>(src, weights, biases, dst_shape, conv_info, depth_multiplier, dilation, out_quant_info);
+}
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/DepthwiseConvolutionLayer.h b/tests/validation/reference/DepthwiseConvolutionLayer.h
index ee323fa..38a225a 100644
--- a/tests/validation/reference/DepthwiseConvolutionLayer.h
+++ b/tests/validation/reference/DepthwiseConvolutionLayer.h
@@ -35,8 +35,8 @@
 {
 namespace reference
 {
-template <typename T, typename TB>
-SimpleTensor<T> depthwise_convolution(const SimpleTensor<T> &src, const SimpleTensor<T> &weights, const SimpleTensor<TB> &biases, const TensorShape &dst_shape, const PadStrideInfo &conv_info,
+template <typename T, typename TW, typename TB>
+SimpleTensor<T> depthwise_convolution(const SimpleTensor<T> &src, const SimpleTensor<TW> &weights, const SimpleTensor<TB> &biases, const TensorShape &dst_shape, const PadStrideInfo &conv_info,
                                       unsigned int depth_multiplier, const Size2D &dilation = Size2D(1U, 1U), const QuantizationInfo &out_quant_info = QuantizationInfo(0.0f, 0));
 } // namespace reference
 } // namespace validation
diff --git a/tests/validation/reference/DepthwiseSeparableConvolutionLayer.cpp b/tests/validation/reference/DepthwiseSeparableConvolutionLayer.cpp
deleted file mode 100644
index 8bc6ddb..0000000
--- a/tests/validation/reference/DepthwiseSeparableConvolutionLayer.cpp
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "DepthwiseConvolutionLayer.h"
-
-#include "DepthwiseSeparableConvolutionLayer.h"
-
-#include "ConvolutionLayer.h"
-#include "Utils.h"
-
-#include "tests/validation/Helpers.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace reference
-{
-// Depthwise separable convolution layer
-template <typename T>
-SimpleTensor<T> depthwise_separable_convolution_layer(const SimpleTensor<T> &src, const SimpleTensor<T> &depthwise_weights, const SimpleTensor<T> &depthwise_biases,
-                                                      const TensorShape     &depthwise_out_shape,
-                                                      const SimpleTensor<T> &pointwise_weights,
-                                                      const SimpleTensor<T> &pointwise_biases, const TensorShape &dst_shape, const PadStrideInfo &depthwise_conv_info, const PadStrideInfo &pointwise_conv_info)
-{
-    // Compute reference
-    SimpleTensor<T> depthwise_out = depthwise_convolution(src, depthwise_weights, depthwise_biases, depthwise_out_shape, depthwise_conv_info, 1);
-    SimpleTensor<T> dst           = convolution_layer(depthwise_out, pointwise_weights, pointwise_biases, dst_shape, pointwise_conv_info);
-
-    return dst;
-}
-
-template SimpleTensor<float> depthwise_separable_convolution_layer(const SimpleTensor<float> &in, const SimpleTensor<float> &depthwise_weights, const SimpleTensor<float> &depthwise_biases,
-                                                                   const TensorShape         &depthwise_out_shape,
-                                                                   const SimpleTensor<float> &pointwise_weights, const SimpleTensor<float> &pointwise_biases, const TensorShape &dst_shape, const PadStrideInfo &depthwise_conv_info,
-                                                                   const PadStrideInfo &pointwise_conv_info);
-} // namespace reference
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/reference/DequantizationLayer.cpp b/tests/validation/reference/DequantizationLayer.cpp
index cceee04..16f25c4 100644
--- a/tests/validation/reference/DequantizationLayer.cpp
+++ b/tests/validation/reference/DequantizationLayer.cpp
@@ -50,16 +50,16 @@
 {
     return static_cast<TOut>(dequantize_qsymm16(val, qinfo));
 }
-
+} // namespace
 template <typename TOut, typename TIn>
-SimpleTensor<TOut> dequantization_layer_nchw(const SimpleTensor<TIn> &src)
+SimpleTensor<TOut> dequantization_layer(const SimpleTensor<TIn> &src)
 {
     const DataType src_data_type = src.data_type();
     const DataType dst_data_type = std::is_same<TOut, float>::value ? DataType::F32 : DataType::F16;
 
     SimpleTensor<TOut> dst{ src.shape(), dst_data_type };
 
-    if(src_data_type == DataType::QSYMM8_PER_CHANNEL)
+    if(is_data_type_quantized_per_channel(src_data_type))
     {
         const int WH = src.shape().x() * src.shape().y();
         const int C  = src.shape().z();
@@ -95,20 +95,6 @@
 
     return dst;
 }
-} // namespace
-template <typename TOut, typename TIn>
-SimpleTensor<TOut> dequantization_layer(const SimpleTensor<TIn> &src)
-{
-    if(src.data_layout() == DataLayout::NHWC && src.data_type() == DataType::QSYMM8_PER_CHANNEL)
-    {
-        SimpleTensor<TIn> src_nchw = reference::permute<TIn>(src, PermutationVector(1U, 2U, 0U));
-        return reference::permute<TOut>(dequantization_layer_nchw<TOut>(src_nchw), PermutationVector(2U, 0U, 1U));
-    }
-    else
-    {
-        return dequantization_layer_nchw<TOut>(src);
-    }
-}
 
 template SimpleTensor<half> dequantization_layer(const SimpleTensor<uint8_t> &src);
 template SimpleTensor<float> dequantization_layer(const SimpleTensor<uint8_t> &src);
diff --git a/tests/validation/reference/Derivative.cpp b/tests/validation/reference/Derivative.cpp
index 0ef8fc2..3c6f325 100644
--- a/tests/validation/reference/Derivative.cpp
+++ b/tests/validation/reference/Derivative.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -61,7 +61,8 @@
 
     ValidRegion valid_region = shape_to_valid_region(src.shape(), border_mode == BorderMode::UNDEFINED, BorderSize(filter_size / 2));
 
-    for(int i = 0; i < src.num_elements(); ++i)
+    const uint32_t num_elements = src.num_elements();
+    for(uint32_t i = 0; i < num_elements; ++i)
     {
         Coordinates coord = index2coord(src.shape(), i);
 
diff --git a/tests/validation/reference/Dilate.cpp b/tests/validation/reference/Dilate.cpp
index 0683a0a..8e244e9 100644
--- a/tests/validation/reference/Dilate.cpp
+++ b/tests/validation/reference/Dilate.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -50,8 +50,9 @@
         dst(x, y) = max[ src(x', y') for x-1<=x'<=x+1, y-1<=y'<=y+1 ] = max({tl, tc, tr, ml, xy, mr, bl, bc, br})
     */
     SimpleTensor<T> dst(src.shape(), src.data_type());
+    const uint32_t  num_elements = src.num_elements();
 
-    for(int i = 0; i < src.num_elements(); ++i)
+    for(uint32_t i = 0; i < num_elements; ++i)
     {
         Coordinates coord = index2coord(src.shape(), i);
         const int   x     = coord.x();
diff --git a/tests/validation/reference/ElementwiseOperations.cpp b/tests/validation/reference/ElementwiseOperations.cpp
index d5a37a0..7b39e18 100644
--- a/tests/validation/reference/ElementwiseOperations.cpp
+++ b/tests/validation/reference/ElementwiseOperations.cpp
@@ -168,7 +168,7 @@
 
         BroadcastUnroll<Coordinates::num_max_dimensions>::unroll(op, src1_tmp, src2_tmp, dst_tmp, convert_policy, id_src1, id_src2, id_dst);
 
-        dst = convert_to_asymmetric(dst_tmp, dst.quantization_info());
+        dst = convert_to_asymmetric<uint8_t>(dst_tmp, dst.quantization_info());
         return dst;
     }
     else
diff --git a/tests/validation/reference/Erode.cpp b/tests/validation/reference/Erode.cpp
index 5e8b36f..e7598c3 100644
--- a/tests/validation/reference/Erode.cpp
+++ b/tests/validation/reference/Erode.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -51,7 +51,8 @@
     */
     SimpleTensor<T> dst(src.shape(), src.data_type());
 
-    for(int i = 0; i < src.num_elements(); ++i)
+    const uint32_t num_elements = src.num_elements();
+    for(uint32_t i = 0; i < num_elements; ++i)
     {
         Coordinates coord = index2coord(src.shape(), i);
         const int   x     = coord.x();
diff --git a/tests/validation/reference/FastCorners.cpp b/tests/validation/reference/FastCorners.cpp
index bcea3f1..32b9115 100644
--- a/tests/validation/reference/FastCorners.cpp
+++ b/tests/validation/reference/FastCorners.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -167,7 +167,8 @@
     SimpleTensor<uint8_t>    scores(src.shape(), DataType::U8);
     ValidRegion              valid_region = shape_to_valid_region(src.shape(), BorderMode::UNDEFINED == border_mode, BorderSize(bresenham_radius));
 
-    for(int i = 0; i < src.num_elements(); ++i)
+    const uint32_t num_elements = src.num_elements();
+    for(uint32_t i = 0; i < num_elements; ++i)
     {
         Coordinates candidate = index2coord(src.shape(), i);
         scores[i]             = 0;
diff --git a/tests/validation/reference/GEMM.cpp b/tests/validation/reference/GEMM.cpp
index 2feab89..3c72b94 100644
--- a/tests/validation/reference/GEMM.cpp
+++ b/tests/validation/reference/GEMM.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -84,8 +84,61 @@
     return dst;
 }
 
+template <typename T, typename std::enable_if<is_floating_point<T>::value, int>::type>
+SimpleTensor<T> gemm_mixed_precision(const SimpleTensor<T> &a, const SimpleTensor<T> &b, const SimpleTensor<T> &c, float alpha, float beta)
+{
+    // GEMM mixed-precision combines F32 accumulators with F16 multiplications
+    // Create reference
+    SimpleTensor<T> dst{ c.shape(), c.data_type(), 1 };
+
+    // Compute reference
+    const int M = a.shape().y();
+    const int N = b.shape().x();
+    const int K = a.shape().x();
+    const int D = a.shape().z(); // Number of matrices in a batch
+    const int W = a.shape()[3];  // Number of batched-gemm (Winograd case)
+
+    const int a_stride_z = K * M;
+    const int a_stride_w = K * M * D;
+
+    const int b_stride_z = b.shape().num_dimensions() > 2 ? N * K : 0;     // Do not slide the matrix B along the 3th dimension in case matrix B has less than 3 dimensions
+    const int b_stride_w = b.shape().num_dimensions() > 3 ? K * N * D : 0; // Do not slide the matrix B along the 4th dimension in case matrix B has less than 4 dimensions
+
+    const int c_stride_z = N * M;
+    const int c_stride_w = N * M * D;
+
+    for(int w = 0; w < W; ++w)
+    {
+        for(int depth = 0; depth < D; ++depth)
+        {
+            const int base_addr_a = depth * a_stride_z + w * a_stride_w;
+            const int base_addr_b = depth * b_stride_z + w * b_stride_w;
+            const int base_addr_c = depth * c_stride_z + w * c_stride_w;
+
+            for(int row = 0; row < M; ++row)
+            {
+                for(int col = 0; col < N; ++col)
+                {
+                    float acc(0);
+
+                    for(int k = 0; k < K; ++k)
+                    {
+                        acc += static_cast<float>(a[base_addr_a + k + row * K] * b[base_addr_b + col + k * N]);
+                    }
+
+                    // Finalize the result: alpha * A * B + beta * C
+                    dst[base_addr_c + col + row * N] = static_cast<T>(alpha * acc + beta * c[base_addr_c + col + row * N]);
+                }
+            }
+        }
+    }
+
+    return dst;
+}
+
 template SimpleTensor<float> gemm(const SimpleTensor<float> &a, const SimpleTensor<float> &b, const SimpleTensor<float> &c, float alpha, float beta);
 template SimpleTensor<half> gemm(const SimpleTensor<half> &a, const SimpleTensor<half> &b, const SimpleTensor<half> &c, float alpha, float beta);
+template SimpleTensor<half> gemm_mixed_precision(const SimpleTensor<half> &a, const SimpleTensor<half> &b, const SimpleTensor<half> &c, float alpha, float beta);
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/GEMM.h b/tests/validation/reference/GEMM.h
index 39007c6..9bcd640 100644
--- a/tests/validation/reference/GEMM.h
+++ b/tests/validation/reference/GEMM.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -38,6 +38,9 @@
 template <typename T, typename std::enable_if<is_floating_point<T>::value, int>::type = 0>
 SimpleTensor<T> gemm(const SimpleTensor<T> &a, const SimpleTensor<T> &b, const SimpleTensor<T> &c, float alpha, float beta);
 
+template <typename T, typename std::enable_if<is_floating_point<T>::value, int>::type = 0>
+SimpleTensor<T> gemm_mixed_precision(const SimpleTensor<T> &a, const SimpleTensor<T> &b, const SimpleTensor<T> &c, float alpha, float beta);
+
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/GEMMLowp.cpp b/tests/validation/reference/GEMMLowp.cpp
index 97d0532..08be4a5 100644
--- a/tests/validation/reference/GEMMLowp.cpp
+++ b/tests/validation/reference/GEMMLowp.cpp
@@ -39,10 +39,11 @@
 namespace
 {
 template <typename T>
-void quantize_down_int32_to_uint8_scale(const SimpleTensor<T> *in, const SimpleTensor<T> *bias, SimpleTensor<uint8_t> *dst, int32_t result_offset, int32_t result_mult_int, int32_t result_shift,
-                                        int32_t min, int32_t max)
+void quantize_down_int32_to_uint8_scale(const SimpleTensor<T> *in, const SimpleTensor<T> *bias, SimpleTensor<uint8_t> *dst, int32_t result_offset, std::vector<int32_t> result_mult_int,
+                                        std::vector<int32_t> result_shift, int32_t min, int32_t max)
 {
-    const int cols_in = in->shape().x();
+    const int  cols_in        = in->shape().x();
+    const bool is_per_channel = result_mult_int.size() > 1;
 
     for(int i = 0; i < in->num_elements(); ++i)
     {
@@ -53,9 +54,9 @@
             result += (*bias)[i % cols_in];
         }
 
-        result *= result_mult_int;
+        result *= (is_per_channel) ? result_mult_int[i % cols_in] : result_mult_int[0];
 
-        result >>= result_shift;
+        result >>= (is_per_channel) ? result_shift[i % cols_in] : result_shift[0];
 
         // Bounded ReLu
         if(min != max)
@@ -68,10 +69,11 @@
 }
 
 template <typename T>
-void quantize_down_int32_to_uint8_scale_by_fixedpoint(const SimpleTensor<T> *in, const SimpleTensor<T> *bias, SimpleTensor<uint8_t> *dst, int32_t result_fixedpoint_multiplier, int32_t result_shift,
-                                                      int32_t result_offset_after_shift, int32_t min, int32_t max)
+void quantize_down_int32_to_uint8_scale_by_fixedpoint(const SimpleTensor<T> *in, const SimpleTensor<T> *bias, SimpleTensor<uint8_t> *dst, std::vector<int32_t> result_fixedpoint_multiplier,
+                                                      std::vector<int32_t> result_shift, int32_t result_offset_after_shift, int32_t min, int32_t max)
 {
-    const int cols_in = in->shape().x();
+    const int  cols_in        = in->shape().x();
+    const bool is_per_channel = result_fixedpoint_multiplier.size() > 1;
 
     for(int i = 0; i < in->num_elements(); ++i)
     {
@@ -83,7 +85,10 @@
         }
 
         // Fixed point multiplication
-        result = asymm_rounding_divide_by_pow2(asymm_int_mult(result, result_fixedpoint_multiplier), result_shift);
+        const int32_t multiplier = (is_per_channel) ? result_fixedpoint_multiplier[i % cols_in] : result_fixedpoint_multiplier[0];
+        const int32_t shift      = (is_per_channel) ? result_shift[i % cols_in] : result_shift[0];
+
+        result = asymm_rounding_divide_by_pow2(asymm_int_mult(result, multiplier), shift);
         result += result_offset_after_shift;
 
         // Bounded ReLu
@@ -112,7 +117,14 @@
         }
 
         // Fixed point multiplication
-        result = asymm_rounding_divide_by_pow2(asymm_int_mult(result, result_fixedpoint_multiplier), result_shift);
+        if(result_shift < 0)
+        {
+            result = asymm_int_mult(result * (1 << (-result_shift)), result_fixedpoint_multiplier);
+        }
+        else
+        {
+            result = asymm_rounding_divide_by_pow2(asymm_int_mult(result, result_fixedpoint_multiplier), result_shift);
+        }
 
         // Bounded ReLu
         if(min != max)
@@ -125,8 +137,8 @@
 }
 } // namespace
 
-template <typename T_out, typename T_in>
-SimpleTensor<T_out> gemmlowp_matrix_multiply_core(const SimpleTensor<T_in> &a, const SimpleTensor<T_in> &b, TensorShape shape_c, int32_t a_offset, int32_t b_offset)
+template <typename T_out, typename T_in, typename T_in_1>
+SimpleTensor<T_out> gemmlowp_matrix_multiply_core(const SimpleTensor<T_in> &a, const SimpleTensor<T_in_1> &b, TensorShape shape_c, int32_t a_offset, int32_t b_offset)
 {
     static_assert(std::is_same<typename std::decay<T_out>::type, int32_t>::value, "Only int32_t is allowed for the output");
 
@@ -179,14 +191,15 @@
 }
 
 // used to validate assembly kernels which don't know anything about offsets
-template <typename T1, typename T2>
-SimpleTensor<T1> gemmlowp(const SimpleTensor<T2> &a, const SimpleTensor<T2> &b, TensorShape shape_c)
+template <typename T1, typename T2, typename T3>
+SimpleTensor<T1> gemmlowp(const SimpleTensor<T2> &a, const SimpleTensor<T3> &b, TensorShape shape_c)
 {
-    return gemmlowp_matrix_multiply_core<T1, T2>(a, b, shape_c, 0, 0);
+    return gemmlowp_matrix_multiply_core<T1, T2, T3>(a, b, shape_c, 0, 0);
 }
 
 template <typename T>
-SimpleTensor<uint8_t> gemmlowp_quantize_down_int32_to_uint8_scale(const SimpleTensor<T> &in, int32_t result_offset, int32_t result_mult_int, int32_t result_shift, int32_t min, int32_t max)
+SimpleTensor<uint8_t> gemmlowp_quantize_down_int32_to_uint8_scale(const SimpleTensor<T> &in, int32_t result_offset, std::vector<int32_t> result_mult_int, std::vector<int32_t> result_shift,
+                                                                  int32_t min, int32_t max)
 {
     SimpleTensor<uint8_t> dst(in.shape(), DataType::QASYMM8);
 
@@ -196,8 +209,8 @@
 }
 
 template <typename T>
-SimpleTensor<uint8_t> gemmlowp_quantize_down_int32_to_uint8_scale(const SimpleTensor<T> &in, const SimpleTensor<T> &bias, int32_t result_offset, int32_t result_mult_int, int32_t result_shift,
-                                                                  int32_t min, int32_t max)
+SimpleTensor<uint8_t> gemmlowp_quantize_down_int32_to_uint8_scale(const SimpleTensor<T> &in, const SimpleTensor<T> &bias, int32_t result_offset, std::vector<int32_t> result_mult_int,
+                                                                  std::vector<int32_t> result_shift, int32_t min, int32_t max)
 {
     SimpleTensor<uint8_t> dst(in.shape(), DataType::QASYMM8);
 
@@ -207,9 +220,8 @@
 }
 
 template <typename T>
-SimpleTensor<uint8_t> gemmlowp_quantize_down_int32_to_uint8_scale_by_fixedpoint(const SimpleTensor<T> &in, int32_t result_fixedpoint_multiplier, int32_t result_shift,
-                                                                                int32_t result_offset_after_shift, int32_t min,
-                                                                                int32_t max)
+SimpleTensor<uint8_t> gemmlowp_quantize_down_int32_to_uint8_scale_by_fixedpoint(const SimpleTensor<T> &in, std::vector<int32_t> result_fixedpoint_multiplier, std::vector<int32_t> result_shift,
+                                                                                int32_t result_offset_after_shift, int32_t min, int32_t max)
 {
     SimpleTensor<uint8_t> dst(in.shape(), DataType::QASYMM8);
 
@@ -219,8 +231,8 @@
 }
 
 template <typename T>
-SimpleTensor<uint8_t> gemmlowp_quantize_down_int32_to_uint8_scale_by_fixedpoint(const SimpleTensor<T> &in, const SimpleTensor<T> &bias, int32_t result_fixedpoint_multiplier, int32_t result_shift,
-                                                                                int32_t result_offset_after_shift, int32_t min, int32_t max)
+SimpleTensor<uint8_t> gemmlowp_quantize_down_int32_to_uint8_scale_by_fixedpoint(const SimpleTensor<T> &in, const SimpleTensor<T> &bias, std::vector<int32_t> result_fixedpoint_multiplier,
+                                                                                std::vector<int32_t> result_shift, int32_t result_offset_after_shift, int32_t min, int32_t max)
 {
     SimpleTensor<uint8_t> dst(in.shape(), DataType::QASYMM8);
 
@@ -251,22 +263,24 @@
     return dst;
 }
 
-template SimpleTensor<uint8_t> gemmlowp_quantize_down_int32_to_uint8_scale_by_fixedpoint(const SimpleTensor<int32_t> &a, int32_t result_fixedpoint_multiplier, int32_t result_shift,
-                                                                                         int32_t result_offset_after_shift, int32_t min, int32_t max);
-template SimpleTensor<uint8_t> gemmlowp_quantize_down_int32_to_uint8_scale_by_fixedpoint(const SimpleTensor<int32_t> &a, const SimpleTensor<int32_t> &b, int32_t result_fixedpoint_multiplier,
-                                                                                         int32_t result_shift, int32_t result_offset_after_shift, int32_t min, int32_t max);
+template SimpleTensor<uint8_t> gemmlowp_quantize_down_int32_to_uint8_scale_by_fixedpoint(const SimpleTensor<int32_t> &a, std::vector<int32_t> result_fixedpoint_multiplier,
+                                                                                         std::vector<int32_t> result_shift, int32_t result_offset_after_shift, int32_t min, int32_t max);
+template SimpleTensor<uint8_t> gemmlowp_quantize_down_int32_to_uint8_scale_by_fixedpoint(const SimpleTensor<int32_t> &a, const SimpleTensor<int32_t> &b,
+                                                                                         std::vector<int32_t> result_fixedpoint_multiplier,
+                                                                                         std::vector<int32_t> result_shift, int32_t result_offset_after_shift, int32_t min, int32_t max);
 template SimpleTensor<int16_t> gemmlowp_quantize_down_int32_to_int16_scale_by_fixedpoint(const SimpleTensor<int32_t> &a, int32_t result_fixedpoint_multiplier, int32_t result_shift,
                                                                                          int32_t min, int32_t max);
 template SimpleTensor<int16_t> gemmlowp_quantize_down_int32_to_int16_scale_by_fixedpoint(const SimpleTensor<int32_t> &a, const SimpleTensor<int32_t> &b, int32_t result_fixedpoint_multiplier,
                                                                                          int32_t result_shift, int32_t min, int32_t max);
-template SimpleTensor<uint8_t> gemmlowp_quantize_down_int32_to_uint8_scale(const SimpleTensor<int32_t> &a, int32_t result_offset, int32_t result_mult_int, int32_t result_shift, int32_t min,
-                                                                           int32_t max);
-template SimpleTensor<uint8_t> gemmlowp_quantize_down_int32_to_uint8_scale(const SimpleTensor<int32_t> &a, const SimpleTensor<int32_t> &b, int32_t result_offset, int32_t result_mult_int,
-                                                                           int32_t result_shift, int32_t min, int32_t max);
+template SimpleTensor<uint8_t> gemmlowp_quantize_down_int32_to_uint8_scale(const SimpleTensor<int32_t> &a, int32_t result_offset, std::vector<int32_t> result_mult_int,
+                                                                           std::vector<int32_t> result_shift, int32_t min, int32_t max);
+template SimpleTensor<uint8_t> gemmlowp_quantize_down_int32_to_uint8_scale(const SimpleTensor<int32_t> &a, const SimpleTensor<int32_t> &b, int32_t result_offset, std::vector<int32_t> result_mult_int,
+                                                                           std::vector<int32_t> result_shift, int32_t min, int32_t max);
 template SimpleTensor<int32_t> gemmlowp_matrix_multiply_core(const SimpleTensor<int8_t> &a, const SimpleTensor<int8_t> &b, TensorShape shape_c, int32_t a_offset, int32_t b_offset);
 template SimpleTensor<int32_t> gemmlowp_matrix_multiply_core(const SimpleTensor<uint8_t> &a, const SimpleTensor<uint8_t> &b, TensorShape shape_c, int32_t a_offset, int32_t b_offset);
-template SimpleTensor<int32_t> gemmlowp(const SimpleTensor<int8_t> &a, const SimpleTensor<int8_t> &b, TensorShape shape_c);
-template SimpleTensor<int32_t> gemmlowp(const SimpleTensor<uint8_t> &a, const SimpleTensor<uint8_t> &b, TensorShape shape_c);
+template SimpleTensor<int32_t> gemmlowp<int32_t, int8_t, int8_t>(const SimpleTensor<int8_t> &a, const SimpleTensor<int8_t> &b, TensorShape shape_c);
+template SimpleTensor<int32_t> gemmlowp<int32_t, uint8_t, uint8_t>(const SimpleTensor<uint8_t> &a, const SimpleTensor<uint8_t> &b, TensorShape shape_c);
+template SimpleTensor<int32_t> gemmlowp<int32_t, uint8_t, int8_t>(const SimpleTensor<uint8_t> &a, const SimpleTensor<int8_t> &b, TensorShape shape_c);
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/GEMMLowp.h b/tests/validation/reference/GEMMLowp.h
index 5581f67..815527e 100644
--- a/tests/validation/reference/GEMMLowp.h
+++ b/tests/validation/reference/GEMMLowp.h
@@ -35,31 +35,32 @@
 {
 namespace reference
 {
-template <typename T>
-SimpleTensor<uint8_t> gemmlowp_quantize_down_int32_to_uint8_scale(const SimpleTensor<T> &in, int32_t result_offset, int32_t result_mult_int, int32_t result_shift, int32_t min = 0, int32_t max = 0);
-template <typename T1, typename T2>
-SimpleTensor<T1> gemmlowp_matrix_multiply_core(const SimpleTensor<T2> &a, const SimpleTensor<T2> &b, TensorShape shape_c, int32_t a_offset, int32_t b_offset);
+template <typename T1, typename T2, typename T3>
+SimpleTensor<T1> gemmlowp_matrix_multiply_core(const SimpleTensor<T2> &a, const SimpleTensor<T3> &b, TensorShape shape_c, int32_t a_offset, int32_t b_offset);
+
+template <typename T1, typename T2, typename T3 = T2>
+SimpleTensor<T1> gemmlowp(const SimpleTensor<T2> &a, const SimpleTensor<T3> &b, TensorShape shape_c);
 
 template <typename T>
-SimpleTensor<uint8_t> gemmlowp_quantize_down_int32_to_uint8_scale(const SimpleTensor<T> &in, int32_t result_offset, int32_t result_mult_int, int32_t result_shift);
-
-template <typename T1, typename T2>
-SimpleTensor<T1> gemmlowp(const SimpleTensor<T2> &a, const SimpleTensor<T2> &b, TensorShape shape_c);
+SimpleTensor<uint8_t> gemmlowp_quantize_down_int32_to_uint8_scale(const SimpleTensor<T> &in, int32_t result_offset, std::vector<int32_t> result_mult_int, std::vector<int32_t> result_shift);
 
 template <typename T>
-SimpleTensor<uint8_t> gemmlowp_quantize_down_int32_to_uint8_scale(const SimpleTensor<T> &in, const SimpleTensor<T> &bias, int32_t result_offset, int32_t result_mult_int, int32_t result_shift,
+SimpleTensor<uint8_t> gemmlowp_quantize_down_int32_to_uint8_scale(const SimpleTensor<T> &in, int32_t result_offset, std::vector<int32_t> result_mult_int, std::vector<int32_t> result_shift,
                                                                   int32_t min = 0, int32_t max = 0);
 
 template <typename T>
-SimpleTensor<uint8_t> gemmlowp_quantize_down_int32_to_uint8_scale_by_fixedpoint(const SimpleTensor<T> &in, int32_t result_fixedpoint_multiplier, int32_t result_shift,
-                                                                                int32_t result_offset_after_shift,
-                                                                                int32_t min = 0, int32_t max = 0);
+SimpleTensor<uint8_t> gemmlowp_quantize_down_int32_to_uint8_scale(const SimpleTensor<T> &in, const SimpleTensor<T> &bias, int32_t result_offset, std::vector<int32_t> result_mult_int,
+                                                                  std::vector<int32_t> result_shift, int32_t min = 0, int32_t max = 0);
 
 template <typename T>
-SimpleTensor<uint8_t> gemmlowp_quantize_down_int32_to_uint8_scale_by_fixedpoint(const SimpleTensor<T> &in, const SimpleTensor<T> &bias, int32_t result_fixedpoint_multiplier, int32_t result_shift,
+SimpleTensor<uint8_t> gemmlowp_quantize_down_int32_to_uint8_scale_by_fixedpoint(const SimpleTensor<T> &in, std::vector<int32_t> result_fixedpoint_multiplier, std::vector<int32_t> result_shift,
                                                                                 int32_t result_offset_after_shift, int32_t min = 0, int32_t max = 0);
 
 template <typename T>
+SimpleTensor<uint8_t> gemmlowp_quantize_down_int32_to_uint8_scale_by_fixedpoint(const SimpleTensor<T> &in, const SimpleTensor<T> &bias, std::vector<int32_t> result_fixedpoint_multiplier,
+                                                                                std::vector<int32_t> result_shift, int32_t result_offset_after_shift, int32_t min = 0, int32_t max = 0);
+
+template <typename T>
 SimpleTensor<int16_t> gemmlowp_quantize_down_int32_to_int16_scale_by_fixedpoint(const SimpleTensor<T> &in, int32_t result_fixedpoint_multiplier, int32_t result_shift,
                                                                                 int32_t min, int32_t max);
 template <typename T>
diff --git a/tests/validation/reference/Gaussian3x3.cpp b/tests/validation/reference/Gaussian3x3.cpp
index c71eade..5ca24a7 100644
--- a/tests/validation/reference/Gaussian3x3.cpp
+++ b/tests/validation/reference/Gaussian3x3.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -39,8 +39,10 @@
 {
     SimpleTensor<T> dst(src.shape(), src.data_type());
     const std::array<T, 9> filter{ { 1, 2, 1, 2, 4, 2, 1, 2, 1 } };
-    const float scale = 1.f / 16.f;
-    for(int element_idx = 0; element_idx < src.num_elements(); ++element_idx)
+    const float    scale        = 1.f / 16.f;
+    const uint32_t num_elements = src.num_elements();
+
+    for(uint32_t element_idx = 0; element_idx < num_elements; ++element_idx)
     {
         const Coordinates id = index2coord(src.shape(), element_idx);
         apply_2d_spatial_filter(id, src, dst, TensorShape(3U, 3U), filter.data(), scale, border_mode, constant_border_value);
diff --git a/tests/validation/reference/Gaussian5x5.cpp b/tests/validation/reference/Gaussian5x5.cpp
index 55bb287..ac84f6d 100644
--- a/tests/validation/reference/Gaussian5x5.cpp
+++ b/tests/validation/reference/Gaussian5x5.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -45,8 +45,10 @@
             4, 16, 24, 16, 4,
             1, 4, 6, 4, 1
         } };
-    const float scale = 1.f / 256.f;
-    for(int element_idx = 0; element_idx < src.num_elements(); ++element_idx)
+    const float    scale        = 1.f / 256.f;
+    const uint32_t num_elements = src.num_elements();
+
+    for(uint32_t element_idx = 0; element_idx < num_elements; ++element_idx)
     {
         const Coordinates id = index2coord(src.shape(), element_idx);
         apply_2d_spatial_filter(id, src, dst, TensorShape(5U, 5U), filter.data(), scale, border_mode, constant_border_value);
diff --git a/tests/validation/reference/HOGDescriptor.cpp b/tests/validation/reference/HOGDescriptor.cpp
index ed22695..f0f573a 100644
--- a/tests/validation/reference/HOGDescriptor.cpp
+++ b/tests/validation/reference/HOGDescriptor.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -69,7 +69,7 @@
 }
 
 template <typename T>
-void hog_block_normalization_compute(SimpleTensor<T> &block, SimpleTensor<T> &desc, const HOGInfo &hog_info, size_t block_idx)
+void hog_block_normalization_compute(SimpleTensor<T> &block, SimpleTensor<T> &desc, const HOGInfo &hog_info, uint32_t block_idx)
 {
     const int         num_bins_per_block = desc.num_channels();
     const HOGNormType norm_type          = hog_info.normalization_type();
@@ -186,8 +186,8 @@
     // Tensor representing single block
     SimpleTensor<T> block(TensorShape{ 1u, 1u }, DataType::F32, cells_per_block.area() * num_bins);
 
-    int block_idx      = 0;
-    int block_y_offset = 0;
+    uint32_t block_idx      = 0;
+    int      block_y_offset = 0;
 
     // Traverse shape
     for(auto sy = block_size.height; sy <= shape_height; sy += block_stride.height)
diff --git a/tests/validation/reference/InstanceNormalizationLayer.cpp b/tests/validation/reference/InstanceNormalizationLayer.cpp
new file mode 100644
index 0000000..ad0ac1b
--- /dev/null
+++ b/tests/validation/reference/InstanceNormalizationLayer.cpp
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "InstanceNormalizationLayer.h"
+
+#include "tests/validation/Helpers.h"
+
+#include <algorithm>
+#include <cmath>
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+template <typename T>
+SimpleTensor<T> instance_normalization(const SimpleTensor<T> &src, float gamma, float beta, float epsilon)
+{
+    SimpleTensor<T> dst{ src.shape(), src.data_type() };
+
+    //NCHW
+    const size_t w_size = src.shape()[0];
+    const size_t h_size = src.shape()[1];
+    const size_t c_size = src.shape()[2];
+    const size_t n_size = src.shape()[3];
+
+    for(size_t n_i = 0; n_i < n_size; ++n_i)
+    {
+        for(size_t c_i = 0; c_i < c_size; ++c_i)
+        {
+            float sum_h_w = 0;
+            float sum_sq_h_w = 0;
+
+            for(size_t h_i = 0; h_i < h_size; ++h_i)
+            {
+                for(size_t w_i = 0; w_i < w_size; ++w_i)
+                {
+                    float val = src[coord2index(src.shape(), Coordinates(w_i, h_i, c_i, n_i))];
+                    sum_h_w += val;
+                    sum_sq_h_w += val*val;
+                }
+            }
+            //Compute mean
+            const float mean_h_w = sum_h_w / (h_size * w_size);
+            //Compute variance
+            const float var_h_w = sum_sq_h_w / (h_size * w_size) - mean_h_w * mean_h_w;;
+
+            //Apply mean
+            for(size_t h_i = 0; h_i < h_size; ++h_i)
+            {
+                for(size_t w_i = 0; w_i < w_size; ++w_i)
+                {
+                    //Compute output
+                    size_t index = coord2index(src.shape(), Coordinates(w_i, h_i, c_i, n_i));
+                    dst[index]   = (src[index] - mean_h_w) * gamma / std::sqrt(var_h_w + epsilon) + beta;
+                }
+            }
+        }
+    }
+    return dst;
+}
+
+template SimpleTensor<float> instance_normalization(const SimpleTensor<float> &src, float gamma, float beta, float epsilon);
+template SimpleTensor<half> instance_normalization(const SimpleTensor<half> &src, float gamma, float beta, float epsilon);
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/reference/InstanceNormalizationLayer.h b/tests/validation/reference/InstanceNormalizationLayer.h
new file mode 100644
index 0000000..2926e09
--- /dev/null
+++ b/tests/validation/reference/InstanceNormalizationLayer.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_INSTANCENORMALIZATION_H__
+#define __ARM_COMPUTE_TEST_INSTANCENORMALIZATION_H__
+
+#include "tests/SimpleTensor.h"
+#include "tests/validation/Helpers.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+template <typename T>
+SimpleTensor<T> instance_normalization(const SimpleTensor<T> &src, float gamma, float beta, float epsilon);
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_TEST_INSTANCENORMALIZATION_H__ */
diff --git a/tests/validation/reference/LogSoftmaxLayer.cpp b/tests/validation/reference/LogSoftmaxLayer.cpp
new file mode 100644
index 0000000..3f21d85
--- /dev/null
+++ b/tests/validation/reference/LogSoftmaxLayer.cpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "LogSoftmaxLayer.h"
+#include "SoftmaxLayer.h"
+
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+template <typename T, typename std::enable_if<is_floating_point<T>::value, int>::type>
+SimpleTensor<T> log_softmax_layer(const SimpleTensor<T> &src, float beta, size_t axis)
+{
+    return softmax_layer_generic<T>(src, beta, axis, true);
+}
+
+template <typename T, typename std::enable_if<std::is_same<T, uint8_t>::value, int>::type>
+SimpleTensor<T> log_softmax_layer(const SimpleTensor<T> &src, float beta, size_t axis)
+{
+    // Note: Output quantization info should always have scale = 1/256 and offset = 0
+    const QuantizationInfo output_quantization_info = QuantizationInfo(1.f / 256, 0);
+
+    SimpleTensor<float> src_tmp = convert_from_asymmetric(src);
+    SimpleTensor<float> dst_tmp = log_softmax_layer<float>(src_tmp, beta, axis);
+    SimpleTensor<T>     dst     = convert_to_asymmetric<uint8_t>(dst_tmp, output_quantization_info);
+    return dst;
+}
+
+template SimpleTensor<float> log_softmax_layer(const SimpleTensor<float> &src, float beta, size_t axis);
+template SimpleTensor<half> log_softmax_layer(const SimpleTensor<half> &src, float beta, size_t axis);
+template SimpleTensor<uint8_t> log_softmax_layer(const SimpleTensor<uint8_t> &src, float beta, size_t axis);
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/reference/LogSoftmaxLayer.h b/tests/validation/reference/LogSoftmaxLayer.h
new file mode 100644
index 0000000..35547ca
--- /dev/null
+++ b/tests/validation/reference/LogSoftmaxLayer.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_LOG_SOFTMAX_LAYER_H__
+#define __ARM_COMPUTE_TEST_LOG_SOFTMAX_LAYER_H__
+
+#include "tests/SimpleTensor.h"
+#include "tests/validation/Helpers.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+template <typename T, typename std::enable_if<is_floating_point<T>::value, int>::type = 0>
+SimpleTensor<T> log_softmax_layer(const SimpleTensor<T> &src, float beta, size_t axis = 1);
+
+template <typename T, typename std::enable_if<std::is_same<T, uint8_t>::value, int>::type = 0>
+SimpleTensor<T> log_softmax_layer(const SimpleTensor<T> &src, float beta, size_t axis = 1);
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_TEST_SOFTMAX_LAYER_H__ */
diff --git a/tests/validation/reference/Median3x3.cpp b/tests/validation/reference/Median3x3.cpp
index 91a787a..314bbe3 100644
--- a/tests/validation/reference/Median3x3.cpp
+++ b/tests/validation/reference/Median3x3.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -45,8 +45,9 @@
 {
     SimpleTensor<T> dst(src.shape(), src.data_type());
     const int       size_tot_filter = filter_size * filter_size;
+    const uint32_t  num_elements    = src.num_elements();
 
-    for(int src_idx = 0; src_idx < src.num_elements(); ++src_idx)
+    for(uint32_t src_idx = 0; src_idx < num_elements; ++src_idx)
     {
         std::array<T, size_tot_filter> filter_elems = { { 0 } };
         Coordinates id = index2coord(src.shape(), src_idx);
diff --git a/tests/validation/reference/NonLinearFilter.cpp b/tests/validation/reference/NonLinearFilter.cpp
index 8669c9c..72433eb 100644
--- a/tests/validation/reference/NonLinearFilter.cpp
+++ b/tests/validation/reference/NonLinearFilter.cpp
@@ -1,24 +1,24 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to
- * deal src the Software without restriction, including without limitation the
+ * deal in the Software without restriction, including without limitation the
  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
- * The above copyright notice and this permission notice shall be included src all
+ * The above copyright notice and this permission notice shall be included in all
  * copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. src NO EVENT SHALL THE
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER src AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * dst OF OR src CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
 #include "NonLinearFilter.h"
@@ -49,8 +49,9 @@
     intermediate_type              current_value = 0;
 
     const ValidRegion valid_region = shape_to_valid_region(src.shape(), border_mode == BorderMode::UNDEFINED, BorderSize(half_mask_size));
+    const uint32_t    num_elements = src.num_elements();
 
-    for(int element_idx = 0, count = 0, index = 0; element_idx < src.num_elements(); ++element_idx, count = 0, index = 0)
+    for(uint32_t element_idx = 0, count = 0, index = 0; element_idx < num_elements; ++element_idx, count = 0, index = 0)
     {
         Coordinates id = index2coord(src.shape(), element_idx);
         if(is_in_valid_region(valid_region, id))
diff --git a/tests/validation/reference/NonMaximaSuppression.cpp b/tests/validation/reference/NonMaximaSuppression.cpp
index 34c6c07..45ce67f 100644
--- a/tests/validation/reference/NonMaximaSuppression.cpp
+++ b/tests/validation/reference/NonMaximaSuppression.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -41,7 +41,8 @@
     SimpleTensor<T> dst(src.shape(), src.data_type(), src.num_channels());
     ValidRegion     valid_region = shape_to_valid_region(src.shape(), border_mode == BorderMode::UNDEFINED, BorderSize(block_size / 2));
 
-    for(int i = 0; i < src.num_elements(); ++i)
+    const uint32_t num_elements = src.num_elements();
+    for(uint32_t i = 0; i < num_elements; ++i)
     {
         Coordinates coord = index2coord(src.shape(), i);
         int         x     = coord.x();
diff --git a/tests/validation/reference/NormalizePlanarYUVLayer.cpp b/tests/validation/reference/NormalizePlanarYUVLayer.cpp
index 563e2a7..ea0e75a 100644
--- a/tests/validation/reference/NormalizePlanarYUVLayer.cpp
+++ b/tests/validation/reference/NormalizePlanarYUVLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -68,7 +68,7 @@
     SimpleTensor<float>   mean_tmp = convert_from_asymmetric(mean);
     SimpleTensor<float>   std_tmp  = convert_from_asymmetric(std);
     SimpleTensor<float>   dst_tmp  = normalize_planar_yuv_layer<float>(src_tmp, mean_tmp, std_tmp);
-    SimpleTensor<uint8_t> dst      = convert_to_asymmetric(dst_tmp, src.quantization_info());
+    SimpleTensor<uint8_t> dst      = convert_to_asymmetric<uint8_t>(dst_tmp, src.quantization_info());
     return dst;
 }
 
diff --git a/tests/validation/reference/PadLayer.cpp b/tests/validation/reference/PadLayer.cpp
index d072bc5..182c16f 100644
--- a/tests/validation/reference/PadLayer.cpp
+++ b/tests/validation/reference/PadLayer.cpp
@@ -54,7 +54,8 @@
     SimpleTensor<T> dst(padded_shape, dst_data_type);
 
     // Reference algorithm: loop over the different dimension of the input.
-    for(int idx = 0; idx < dst.num_elements(); ++idx)
+    const uint32_t num_elements = dst.num_elements();
+    for(uint32_t idx = 0; idx < num_elements; ++idx)
     {
         const Coordinates coord = index2coord(padded_shape, idx);
 
@@ -115,11 +116,11 @@
                 case PaddingMode::REFLECT:
                 {
                     const Coordinates orig_coords{ orig_coord_reflect(0),
-                                             orig_coord_reflect(1),
-                                             orig_coord_reflect(2),
-                                             orig_coord_reflect(3),
-                                             orig_coord_reflect(4),
-                                             orig_coord_reflect(5) };
+                              orig_coord_reflect(1),
+                              orig_coord_reflect(2),
+                              orig_coord_reflect(3),
+                              orig_coord_reflect(4),
+                              orig_coord_reflect(5) };
 
                     const size_t idx_src = coord2index(orig_shape, orig_coords);
                     dst[idx]             = src[idx_src];
@@ -128,11 +129,11 @@
                 case PaddingMode::SYMMETRIC:
                 {
                     const Coordinates orig_coords{ orig_coord_symm(0),
-                                             orig_coord_symm(1),
-                                             orig_coord_symm(2),
-                                             orig_coord_symm(3),
-                                             orig_coord_symm(4),
-                                             orig_coord_symm(5) };
+                              orig_coord_symm(1),
+                              orig_coord_symm(2),
+                              orig_coord_symm(3),
+                              orig_coord_symm(4),
+                              orig_coord_symm(5) };
 
                     const size_t idx_src = coord2index(orig_shape, orig_coords);
                     dst[idx]             = src[idx_src];
@@ -147,12 +148,15 @@
         {
             // If the tuple[i,j,k,l,m] is not in the padding area, then copy the input into the output
 
-            const Coordinates orig_coords{ i - paddings_extended[0].first,
-                                     j - paddings_extended[1].first,
-                                     k - paddings_extended[2].first,
-                                     l - paddings_extended[3].first,
-                                     m - paddings_extended[4].first,
-                                     n - paddings_extended[5].first };
+            const Coordinates orig_coords
+            {
+                i - paddings_extended[0].first,
+                j - paddings_extended[1].first,
+                k - paddings_extended[2].first,
+                l - paddings_extended[3].first,
+                m - paddings_extended[4].first,
+                n - paddings_extended[5].first
+            };
 
             const size_t idx_src = coord2index(orig_shape, orig_coords);
             dst[idx]             = src[idx_src];
@@ -164,11 +168,12 @@
 
 template SimpleTensor<float> pad_layer(const SimpleTensor<float> &src, const PaddingList &paddings, const PixelValue const_value = PixelValue(), const PaddingMode mode);
 template SimpleTensor<half> pad_layer(const SimpleTensor<half> &src, const PaddingList &paddings, const PixelValue const_value = PixelValue(), const PaddingMode mode);
-template SimpleTensor<uint32_t> pad_layer(const SimpleTensor<uint32_t> &src, const PaddingList &paddings, const PixelValue const_value = PixelValue(), const PaddingMode mode);
 template SimpleTensor<uint8_t> pad_layer(const SimpleTensor<uint8_t> &src, const PaddingList &paddings, const PixelValue const_value = PixelValue(), const PaddingMode mode);
 template SimpleTensor<int8_t> pad_layer(const SimpleTensor<int8_t> &src, const PaddingList &paddings, const PixelValue const_value = PixelValue(), const PaddingMode mode);
 template SimpleTensor<uint16_t> pad_layer(const SimpleTensor<uint16_t> &src, const PaddingList &paddings, const PixelValue const_value = PixelValue(), const PaddingMode mode);
 template SimpleTensor<int16_t> pad_layer(const SimpleTensor<int16_t> &src, const PaddingList &paddings, const PixelValue const_value = PixelValue(), const PaddingMode mode);
+template SimpleTensor<uint32_t> pad_layer(const SimpleTensor<uint32_t> &src, const PaddingList &paddings, const PixelValue const_value = PixelValue(), const PaddingMode mode);
+template SimpleTensor<int32_t> pad_layer(const SimpleTensor<int32_t> &src, const PaddingList &paddings, const PixelValue const_value = PixelValue(), const PaddingMode mode);
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/Permute.cpp b/tests/validation/reference/Permute.cpp
index 619a787..36b07dc 100644
--- a/tests/validation/reference/Permute.cpp
+++ b/tests/validation/reference/Permute.cpp
@@ -45,7 +45,8 @@
     SimpleTensor<T> dst{ dst_shape, src.data_type(), src.num_channels(), src.quantization_info() };
 
     // Compute reference
-    for(int i = 0; i < src.num_elements(); ++i)
+    const uint32_t num_elements = src.num_elements();
+    for(uint32_t i = 0; i < num_elements; ++i)
     {
         const Coordinates src_coords = index2coord(src.shape(), i);
         Coordinates       dst_coords = src_coords;
diff --git a/tests/validation/reference/PixelWiseMultiplication.cpp b/tests/validation/reference/PixelWiseMultiplication.cpp
index 41a9192..d9895e5 100644
--- a/tests/validation/reference/PixelWiseMultiplication.cpp
+++ b/tests/validation/reference/PixelWiseMultiplication.cpp
@@ -160,7 +160,7 @@
         SimpleTensor<float> src1_tmp = convert_from_asymmetric(src1);
         SimpleTensor<float> src2_tmp = convert_from_asymmetric(src2);
         SimpleTensor<float> dst_tmp  = pixel_wise_multiplication<float>(src1_tmp, src2_tmp, scale, convert_policy, rounding_policy, qout);
-        dst                          = convert_to_asymmetric(dst_tmp, qout);
+        dst                          = convert_to_asymmetric<uint8_t>(dst_tmp, qout);
     }
     else
     {
diff --git a/tests/validation/reference/PoolingLayer.cpp b/tests/validation/reference/PoolingLayer.cpp
index f4112a4..010412c 100644
--- a/tests/validation/reference/PoolingLayer.cpp
+++ b/tests/validation/reference/PoolingLayer.cpp
@@ -37,8 +37,8 @@
 {
 using namespace arm_compute::misc::shape_calculator;
 
-template <typename T>
-SimpleTensor<T> pooling_layer(const SimpleTensor<T> &src, const PoolingLayerInfo &info, const QuantizationInfo &output_qinfo)
+template <typename T, typename ACC_T, typename std::enable_if<is_floating_point<T>::value, int>::type>
+SimpleTensor<T> pooling_layer_internal(const SimpleTensor<T> &src, const PoolingLayerInfo &info, const QuantizationInfo &output_qinfo)
 {
     ARM_COMPUTE_UNUSED(output_qinfo); // requantization occurs in pooling_layer<uint8_t>
     ARM_COMPUTE_ERROR_ON(info.is_global_pooling() && (src.shape().x() != src.shape().y()));
@@ -79,12 +79,12 @@
                     wstart     = std::max(wstart, 0);
                     hstart     = std::max(hstart, 0);
 
-                    T max_val = std::numeric_limits<T>::lowest();
+                    auto max_val = std::numeric_limits<ACC_T>::lowest();
                     for(int y = hstart; y < hend; ++y)
                     {
                         for(int x = wstart; x < wend; ++x)
                         {
-                            const T val = src[r * h_src * w_src + y * w_src + x];
+                            const auto val = static_cast<ACC_T>(src[r * h_src * w_src + y * w_src + x]);
                             if(val > max_val)
                             {
                                 max_val = val;
@@ -92,7 +92,7 @@
                         }
                     }
 
-                    dst[r * h_dst * w_dst + h * w_dst + w] = max_val;
+                    dst[r * h_dst * w_dst + h * w_dst + w] = static_cast<T>(max_val);
                 }
             }
         }
@@ -105,16 +105,16 @@
             {
                 for(int w = 0; w < w_dst; ++w)
                 {
-                    T   avg_val(0);
-                    int wstart = w * pool_stride_x - pad_left;
-                    int hstart = h * pool_stride_y - pad_top;
-                    int wend   = std::min(wstart + pool_size_x, w_src + pad_right);
-                    int hend   = std::min(hstart + pool_size_y, h_src + pad_bottom);
-                    int pool   = (hend - hstart) * (wend - wstart);
-                    wstart     = std::max(wstart, 0);
-                    hstart     = std::max(hstart, 0);
-                    wend       = std::min(wend, w_src);
-                    hend       = std::min(hend, h_src);
+                    ACC_T avg_val(0);
+                    int   wstart = w * pool_stride_x - pad_left;
+                    int   hstart = h * pool_stride_y - pad_top;
+                    int   wend   = std::min(wstart + pool_size_x, w_src + pad_right);
+                    int   hend   = std::min(hstart + pool_size_y, h_src + pad_bottom);
+                    int   pool   = (hend - hstart) * (wend - wstart);
+                    wstart       = std::max(wstart, 0);
+                    hstart       = std::max(hstart, 0);
+                    wend         = std::min(wend, w_src);
+                    hend         = std::min(hend, h_src);
                     // Exclude padding pixels from the average
                     if(exclude_padding)
                     {
@@ -127,7 +127,7 @@
                         {
                             for(int x = wstart; x < wend; ++x)
                             {
-                                avg_val += src[r * h_src * w_src + y * w_src + x];
+                                avg_val += static_cast<ACC_T>(src[r * h_src * w_src + y * w_src + x]);
                             }
                         }
                         dst[r * h_dst * w_dst + h * w_dst + w] = avg_val / pool;
@@ -138,11 +138,11 @@
                         {
                             for(int x = wstart; x < wend; ++x)
                             {
-                                const T val = src[r * h_src * w_src + y * w_src + x];
+                                const auto val = static_cast<ACC_T>(src[r * h_src * w_src + y * w_src + x]);
                                 avg_val += val * val;
                             }
                         }
-                        dst[r * h_dst * w_dst + h * w_dst + w] = std::sqrt(avg_val / pool);
+                        dst[r * h_dst * w_dst + h * w_dst + w] = static_cast<T>(std::sqrt(avg_val / pool));
                     }
                 }
             }
@@ -152,17 +152,37 @@
     return dst;
 }
 
+template SimpleTensor<float> pooling_layer_internal<float>(const SimpleTensor<float> &src, const PoolingLayerInfo &info, const QuantizationInfo &output_qinfo);
+template SimpleTensor<half> pooling_layer_internal<half>(const SimpleTensor<half> &src, const PoolingLayerInfo &info, const QuantizationInfo &output_qinfo);
+template SimpleTensor<half> pooling_layer_internal<half, float>(const SimpleTensor<half> &src, const PoolingLayerInfo &info, const QuantizationInfo &output_qinfo);
+
+template <typename T>
+SimpleTensor<T> pooling_layer(const SimpleTensor<T> &src, const PoolingLayerInfo &info, const QuantizationInfo &output_qinfo)
+{
+    return pooling_layer_internal<T, T>(src, info, output_qinfo);
+}
+
 template <>
 SimpleTensor<uint8_t> pooling_layer<uint8_t>(const SimpleTensor<uint8_t> &src, const PoolingLayerInfo &info, const QuantizationInfo &output_qinfo)
 {
     SimpleTensor<float>   src_tmp = convert_from_asymmetric(src);
-    SimpleTensor<float>   dst_tmp = pooling_layer<float>(src_tmp, info, output_qinfo);
-    SimpleTensor<uint8_t> dst     = convert_to_asymmetric(dst_tmp, output_qinfo);
+    SimpleTensor<float>   dst_tmp = pooling_layer_internal<float>(src_tmp, info, output_qinfo);
+    SimpleTensor<uint8_t> dst     = convert_to_asymmetric<uint8_t>(dst_tmp, output_qinfo);
     return dst;
 }
 
+template <>
+SimpleTensor<half> pooling_layer(const SimpleTensor<half> &src, const PoolingLayerInfo &info, const QuantizationInfo &output_qinfo)
+{
+    if(src.data_type() == DataType::F16 && info.fp_mixed_precision())
+    {
+        return pooling_layer_internal<half, float>(src, info, output_qinfo);
+    }
+
+    return pooling_layer_internal<half>(src, info, output_qinfo);
+}
+
 template SimpleTensor<float> pooling_layer(const SimpleTensor<float> &src, const PoolingLayerInfo &info, const QuantizationInfo &output_qinfo);
-template SimpleTensor<half> pooling_layer(const SimpleTensor<half> &src, const PoolingLayerInfo &info, const QuantizationInfo &output_qinfo);
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/PoolingLayer.h b/tests/validation/reference/PoolingLayer.h
index 1c0b7ff..fc36d51 100644
--- a/tests/validation/reference/PoolingLayer.h
+++ b/tests/validation/reference/PoolingLayer.h
@@ -35,6 +35,8 @@
 {
 namespace reference
 {
+template <typename T, typename ACC_T = T, typename std::enable_if<is_floating_point<T>::value, int>::type = 0>
+SimpleTensor<T> pooling_layer_internal(const SimpleTensor<T> &src, const PoolingLayerInfo &info, const QuantizationInfo &output_qinfo);
 template <typename T>
 SimpleTensor<T> pooling_layer(const SimpleTensor<T> &src, const PoolingLayerInfo &info, const QuantizationInfo &output_qinfo);
 } // namespace reference
diff --git a/tests/validation/reference/QuantizationLayer.cpp b/tests/validation/reference/QuantizationLayer.cpp
index 182585a..ae23f7e 100644
--- a/tests/validation/reference/QuantizationLayer.cpp
+++ b/tests/validation/reference/QuantizationLayer.cpp
@@ -33,26 +33,45 @@
 {
 namespace reference
 {
-template <typename T>
-SimpleTensor<uint8_t> quantization_layer(const SimpleTensor<T> &src, const QuantizationInfo &quantization_info)
+template <typename Tin, typename Tout>
+SimpleTensor<Tout> quantization_layer(const SimpleTensor<Tin> &src, DataType output_data_type, const QuantizationInfo &quantization_info)
 {
     // Create reference
-    SimpleTensor<uint8_t> dst{ src.shape(), DataType::QASYMM8, 1, quantization_info };
+    SimpleTensor<Tout> dst{ src.shape(), output_data_type, 1, quantization_info };
 
     const UniformQuantizationInfo qinfo = quantization_info.uniform();
-    for(int i = 0; i < src.num_elements(); ++i)
+    switch(output_data_type)
     {
+        case DataType::QASYMM8:
+            for(int i = 0; i < src.num_elements(); ++i)
+            {
 #ifdef __aarch64__
-        dst[i] = quantize_qasymm8((src[i]), qinfo, RoundingPolicy::TO_NEAREST_EVEN);
+                dst[i] = quantize_qasymm8((src[i]), qinfo, RoundingPolicy::TO_NEAREST_EVEN);
 #else  // __aarch64__
-        dst[i] = quantize_qasymm8((src[i]), qinfo, RoundingPolicy::TO_ZERO);
+                dst[i] = quantize_qasymm8((src[i]), qinfo, RoundingPolicy::TO_ZERO);
 #endif // __aarch64__
+            }
+            break;
+        case DataType::QASYMM16:
+            for(int i = 0; i < src.num_elements(); ++i)
+            {
+#ifdef __aarch64__
+                dst[i] = quantize_qasymm16((src[i]), qinfo, RoundingPolicy::TO_NEAREST_EVEN);
+#else  // __aarch64__
+                dst[i] = quantize_qasymm16((src[i]), qinfo, RoundingPolicy::TO_ZERO);
+#endif // __aarch64__
+            }
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Unsupported output data type");
     }
     return dst;
 }
 
-template SimpleTensor<uint8_t> quantization_layer(const SimpleTensor<half> &src, const QuantizationInfo &quantization_info);
-template SimpleTensor<uint8_t> quantization_layer(const SimpleTensor<float> &src, const QuantizationInfo &quantization_info);
+template SimpleTensor<uint8_t> quantization_layer(const SimpleTensor<half> &src, DataType output_data_type, const QuantizationInfo &quantization_info);
+template SimpleTensor<uint8_t> quantization_layer(const SimpleTensor<float> &src, DataType output_data_type, const QuantizationInfo &quantization_info);
+template SimpleTensor<uint16_t> quantization_layer(const SimpleTensor<half> &src, DataType output_data_type, const QuantizationInfo &quantization_info);
+template SimpleTensor<uint16_t> quantization_layer(const SimpleTensor<float> &src, DataType output_data_type, const QuantizationInfo &quantization_info);
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/QuantizationLayer.h b/tests/validation/reference/QuantizationLayer.h
index 462396f..0e80b49 100644
--- a/tests/validation/reference/QuantizationLayer.h
+++ b/tests/validation/reference/QuantizationLayer.h
@@ -35,8 +35,8 @@
 {
 namespace reference
 {
-template <typename T>
-SimpleTensor<uint8_t> quantization_layer(const SimpleTensor<T> &src, const QuantizationInfo &quantization_info);
+template <typename Tin, typename Tout>
+SimpleTensor<Tout> quantization_layer(const SimpleTensor<Tin> &src, DataType output_data_type, const QuantizationInfo &quantization_info);
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/ROIAlignLayer.cpp b/tests/validation/reference/ROIAlignLayer.cpp
index 8a76983..c32dce7 100644
--- a/tests/validation/reference/ROIAlignLayer.cpp
+++ b/tests/validation/reference/ROIAlignLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -112,15 +112,33 @@
 {
     return std::max(lower, std::min(value, upper));
 }
-} // namespace
-template <typename T>
-SimpleTensor<T> roi_align_layer(const SimpleTensor<T> &src, const SimpleTensor<T> &rois, const ROIPoolingLayerInfo &pool_info)
+
+SimpleTensor<float> convert_rois_from_asymmetric(SimpleTensor<uint16_t> rois)
 {
+    const UniformQuantizationInfo &quantization_info = rois.quantization_info().uniform();
+    SimpleTensor<float>            dst{ rois.shape(), DataType::F32, 1, QuantizationInfo(), rois.data_layout() };
+
+    for(int i = 0; i < rois.num_elements(); i += 5)
+    {
+        dst[i]     = static_cast<float>(rois[i]); // batch idx
+        dst[i + 1] = dequantize_qasymm16(rois[i + 1], quantization_info);
+        dst[i + 2] = dequantize_qasymm16(rois[i + 2], quantization_info);
+        dst[i + 3] = dequantize_qasymm16(rois[i + 3], quantization_info);
+        dst[i + 4] = dequantize_qasymm16(rois[i + 4], quantization_info);
+    }
+    return dst;
+}
+} // namespace
+template <typename T, typename TRois>
+SimpleTensor<T> roi_align_layer(const SimpleTensor<T> &src, const SimpleTensor<TRois> &rois, const ROIPoolingLayerInfo &pool_info, const QuantizationInfo &output_qinfo)
+{
+    ARM_COMPUTE_UNUSED(output_qinfo);
+
     const size_t values_per_roi = rois.shape()[0];
     const size_t num_rois       = rois.shape()[1];
     DataType     dst_data_type  = src.data_type();
 
-    const auto *rois_ptr = static_cast<const T *>(rois.data());
+    const auto *rois_ptr = static_cast<const TRois *>(rois.data());
 
     TensorShape     input_shape = src.shape();
     TensorShape     output_shape(pool_info.pooled_width(), pool_info.pooled_height(), src.shape()[2], num_rois);
@@ -183,8 +201,19 @@
     }
     return dst;
 }
-template SimpleTensor<float> roi_align_layer(const SimpleTensor<float> &src, const SimpleTensor<float> &rois, const ROIPoolingLayerInfo &pool_info);
-template SimpleTensor<half> roi_align_layer(const SimpleTensor<half> &src, const SimpleTensor<half> &rois, const ROIPoolingLayerInfo &pool_info);
+
+template SimpleTensor<float> roi_align_layer(const SimpleTensor<float> &src, const SimpleTensor<float> &rois, const ROIPoolingLayerInfo &pool_info, const QuantizationInfo &output_qinfo);
+template SimpleTensor<half> roi_align_layer(const SimpleTensor<half> &src, const SimpleTensor<half> &rois, const ROIPoolingLayerInfo &pool_info, const QuantizationInfo &output_qinfo);
+
+template <>
+SimpleTensor<uint8_t> roi_align_layer(const SimpleTensor<uint8_t> &src, const SimpleTensor<uint16_t> &rois, const ROIPoolingLayerInfo &pool_info, const QuantizationInfo &output_qinfo)
+{
+    SimpleTensor<float>   src_tmp  = convert_from_asymmetric(src);
+    SimpleTensor<float>   rois_tmp = convert_rois_from_asymmetric(rois);
+    SimpleTensor<float>   dst_tmp  = roi_align_layer<float, float>(src_tmp, rois_tmp, pool_info, output_qinfo);
+    SimpleTensor<uint8_t> dst      = convert_to_asymmetric<uint8_t>(dst_tmp, output_qinfo);
+    return dst;
+}
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/ROIAlignLayer.h b/tests/validation/reference/ROIAlignLayer.h
index b67ff42..e156813 100644
--- a/tests/validation/reference/ROIAlignLayer.h
+++ b/tests/validation/reference/ROIAlignLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -36,8 +36,8 @@
 {
 namespace reference
 {
-template <typename T>
-SimpleTensor<T> roi_align_layer(const SimpleTensor<T> &src, const SimpleTensor<T> &rois, const ROIPoolingLayerInfo &pool_info);
+template <typename T, typename TRois>
+SimpleTensor<T> roi_align_layer(const SimpleTensor<T> &src, const SimpleTensor<TRois> &rois, const ROIPoolingLayerInfo &pool_info, const QuantizationInfo &output_qinfo);
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/Range.cpp b/tests/validation/reference/Range.cpp
index c24512f..ad13454 100644
--- a/tests/validation/reference/Range.cpp
+++ b/tests/validation/reference/Range.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -60,7 +60,7 @@
     {
         SimpleTensor<float> dst_tmp{ dst.shape(), DataType::F32, 1 };
         generate_range(dst_tmp, start, num_of_elements, step);
-        return convert_to_asymmetric(dst_tmp, dst.quantization_info());
+        return convert_to_asymmetric<uint8_t>(dst_tmp, dst.quantization_info());
     }
     generate_range(dst, start, num_of_elements, step);
     return dst;
diff --git a/tests/validation/reference/ReductionOperation.cpp b/tests/validation/reference/ReductionOperation.cpp
index fe128cc..330a3b8 100644
--- a/tests/validation/reference/ReductionOperation.cpp
+++ b/tests/validation/reference/ReductionOperation.cpp
@@ -179,7 +179,7 @@
 {
     // Create reference
     const bool         is_arg_min_max   = (op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX);
-    DataType           output_data_type = is_arg_min_max ? DataType::U32 : src.data_type();
+    DataType           output_data_type = is_arg_min_max ? DataType::S32 : src.data_type();
     SimpleTensor<OT>   dst{ dst_shape, output_data_type, 1, src.quantization_info() };
     const unsigned int src_width    = src.shape().x();
     const unsigned int src_height   = src.shape().y();
@@ -281,7 +281,7 @@
     {
         SimpleTensor<float> src_f = convert_from_asymmetric(src);
         SimpleTensor<float> dst_f = reference::reduction_operation<float, float>(src_f, dst_shape, axis, op);
-        return convert_to_asymmetric(dst_f, src.quantization_info());
+        return convert_to_asymmetric<uint8_t>(dst_f, src.quantization_info());
     }
     else
     {
@@ -292,10 +292,10 @@
 template SimpleTensor<float> reduction_operation(const SimpleTensor<float> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op);
 template SimpleTensor<half> reduction_operation(const SimpleTensor<half> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op);
 
-template SimpleTensor<uint32_t> reduction_operation(const SimpleTensor<float> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op);
-template SimpleTensor<uint32_t> reduction_operation(const SimpleTensor<int32_t> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op);
-template SimpleTensor<uint32_t> reduction_operation(const SimpleTensor<half> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op);
-template SimpleTensor<uint32_t> reduction_operation(const SimpleTensor<uint8_t> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op);
+template SimpleTensor<int32_t> reduction_operation(const SimpleTensor<float> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op);
+template SimpleTensor<int32_t> reduction_operation(const SimpleTensor<int32_t> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op);
+template SimpleTensor<int32_t> reduction_operation(const SimpleTensor<half> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op);
+template SimpleTensor<int32_t> reduction_operation(const SimpleTensor<uint8_t> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op);
 
 } // namespace reference
 } // namespace validation
diff --git a/tests/validation/reference/Remap.cpp b/tests/validation/reference/Remap.cpp
index f862c13..a7352eb 100644
--- a/tests/validation/reference/Remap.cpp
+++ b/tests/validation/reference/Remap.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -44,9 +44,10 @@
     ARM_COMPUTE_ERROR_ON_MSG(border_mode == BorderMode::REPLICATE, "BorderMode not supported");
     SimpleTensor<T> out(in.shape(), in.data_type());
     ARM_COMPUTE_ERROR_ON(out.num_elements() != map_x.num_elements());
-    const int width  = in.shape().x();
-    const int height = in.shape().y();
-    for(int idx = 0; idx < out.num_elements(); idx++)
+    const int      width        = in.shape().x();
+    const int      height       = in.shape().y();
+    const uint32_t num_elements = out.num_elements();
+    for(uint32_t idx = 0; idx < num_elements; idx++)
     {
         const Coordinates id_out = index2coord(out.shape(), idx);
         valid_mask[idx]          = 1;
diff --git a/tests/validation/reference/Reverse.cpp b/tests/validation/reference/Reverse.cpp
index 1662dc2..4bd8efc 100644
--- a/tests/validation/reference/Reverse.cpp
+++ b/tests/validation/reference/Reverse.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -54,7 +54,8 @@
         to_reverse[axis[i]] = true;
     }
 
-    for(int i = 0; i < src.num_elements(); ++i)
+    const uint32_t num_elements = src.num_elements();
+    for(uint32_t i = 0; i < num_elements; ++i)
     {
         const Coordinates  src_coord = index2coord(src.shape(), i);
         const unsigned int dst_x     = to_reverse[0] ? width - src_coord[0] - 1 : src_coord[0];
diff --git a/tests/validation/reference/Scale.cpp b/tests/validation/reference/Scale.cpp
index 63a2853..7962459 100644
--- a/tests/validation/reference/Scale.cpp
+++ b/tests/validation/reference/Scale.cpp
@@ -63,7 +63,8 @@
         policy = InterpolationPolicy::NEAREST_NEIGHBOR;
     }
 
-    for(int element_idx = 0, count = 0; element_idx < out.num_elements(); ++element_idx, ++count)
+    const uint32_t num_elements = out.num_elements();
+    for(uint32_t element_idx = 0, count = 0; element_idx < num_elements; ++element_idx, ++count)
     {
         Coordinates id    = index2coord(out.shape(), element_idx);
         int         idx   = id.x();
@@ -196,7 +197,7 @@
         SimpleTensor<float> src_tmp                 = convert_from_asymmetric(src);
         float               constant_border_value_f = dequantize_qasymm8(constant_border_value, src.quantization_info());
         SimpleTensor<float> dst_tmp                 = scale_core<float>(src_tmp, scale_x, scale_y, policy, border_mode, constant_border_value_f, sampling_policy, ceil_policy_scale);
-        dst                                         = convert_to_asymmetric(dst_tmp, src.quantization_info());
+        dst                                         = convert_to_asymmetric<uint8_t>(dst_tmp, src.quantization_info());
     }
     else
     {
diff --git a/tests/validation/reference/Scharr.cpp b/tests/validation/reference/Scharr.cpp
index 98e4d62..060192b 100644
--- a/tests/validation/reference/Scharr.cpp
+++ b/tests/validation/reference/Scharr.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -68,7 +68,8 @@
 
     ValidRegion valid_region = shape_to_valid_region(src.shape(), border_mode == BorderMode::UNDEFINED, BorderSize(filter_size / 2));
 
-    for(int i = 0; i < src.num_elements(); ++i)
+    const uint32_t num_elements = src.num_elements();
+    for(uint32_t i = 0; i < num_elements; ++i)
     {
         Coordinates coord = index2coord(src.shape(), i);
 
diff --git a/tests/validation/reference/Sobel.cpp b/tests/validation/reference/Sobel.cpp
index 233f1ad..1f35717 100644
--- a/tests/validation/reference/Sobel.cpp
+++ b/tests/validation/reference/Sobel.cpp
@@ -110,7 +110,8 @@
 
     ValidRegion valid_region = shape_to_valid_region(src.shape(), border_mode == BorderMode::UNDEFINED, BorderSize(filter_size / 2));
 
-    for(int i = 0; i < src.num_elements(); ++i)
+    const uint32_t num_elements = src.num_elements();
+    for(uint32_t i = 0; i < num_elements; ++i)
     {
         Coordinates coord = index2coord(src.shape(), i);
 
diff --git a/tests/validation/reference/SoftmaxLayer.cpp b/tests/validation/reference/SoftmaxLayer.cpp
index f1b94c0..ef2468d 100644
--- a/tests/validation/reference/SoftmaxLayer.cpp
+++ b/tests/validation/reference/SoftmaxLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -34,7 +34,7 @@
 namespace reference
 {
 template <typename T, typename std::enable_if<is_floating_point<T>::value, int>::type>
-SimpleTensor<T> softmax_layer(const SimpleTensor<T> &src, float beta, size_t axis)
+SimpleTensor<T> softmax_layer_generic(const SimpleTensor<T> &src, float beta, size_t axis, bool is_log)
 {
     // Create reference
     SimpleTensor<T> dst{ src.shape(), src.data_type(), 1 };
@@ -65,23 +65,48 @@
 
         // Regularize
         T sum(0.f);
-        std::transform(src_row_ptr, src_row_ptr + lower_dims, dst_row_ptr, [&sum, max, beta](T val)
+        std::transform(src_row_ptr, src_row_ptr + lower_dims, dst_row_ptr, [&sum, max, beta, is_log](T val)
         {
-            const T res(std::exp((val - max) * beta));
-            sum += res;
+            T res{ (val - max) *beta };
+
+            if(is_log)
+            {
+                sum += std::exp(res);
+            }
+            else
+            {
+                res = std::exp(res);
+                sum += res;
+            }
             return res;
         });
 
         // Normalize
-        std::transform(dst_row_ptr, dst_row_ptr + lower_dims, dst_row_ptr, [sum](T val)
+        std::transform(dst_row_ptr, dst_row_ptr + lower_dims, dst_row_ptr, [sum, is_log](T val)
         {
-            return val / sum;
+            if(is_log)
+            {
+                return val - sum;
+            }
+            else
+            {
+                return val / sum;
+            }
         });
     }
 
     return dst;
 }
 
+template SimpleTensor<float> softmax_layer_generic(const SimpleTensor<float> &src, float beta, size_t axis, bool is_log);
+template SimpleTensor<half> softmax_layer_generic(const SimpleTensor<half> &src, float beta, size_t axis, bool is_log);
+
+template <typename T, typename std::enable_if<is_floating_point<T>::value, int>::type>
+SimpleTensor<T> softmax_layer(const SimpleTensor<T> &src, float beta, size_t axis)
+{
+    return softmax_layer_generic<T>(src, beta, axis, false);
+}
+
 template <typename T, typename std::enable_if<std::is_same<T, uint8_t>::value, int>::type>
 SimpleTensor<T> softmax_layer(const SimpleTensor<T> &src, float beta, size_t axis)
 {
@@ -90,7 +115,7 @@
 
     SimpleTensor<float> src_tmp = convert_from_asymmetric(src);
     SimpleTensor<float> dst_tmp = softmax_layer<float>(src_tmp, beta, axis);
-    SimpleTensor<T>     dst     = convert_to_asymmetric(dst_tmp, output_quantization_info);
+    SimpleTensor<T>     dst     = convert_to_asymmetric<uint8_t>(dst_tmp, output_quantization_info);
     return dst;
 }
 
diff --git a/tests/validation/reference/SoftmaxLayer.h b/tests/validation/reference/SoftmaxLayer.h
index d21ca2b..fa9485c 100644
--- a/tests/validation/reference/SoftmaxLayer.h
+++ b/tests/validation/reference/SoftmaxLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -36,6 +36,9 @@
 namespace reference
 {
 template <typename T, typename std::enable_if<is_floating_point<T>::value, int>::type = 0>
+SimpleTensor<T> softmax_layer_generic(const SimpleTensor<T> &src, float beta, size_t axis, bool is_log = false);
+
+template <typename T, typename std::enable_if<is_floating_point<T>::value, int>::type = 0>
 SimpleTensor<T> softmax_layer(const SimpleTensor<T> &src, float beta, size_t axis = 1);
 
 template <typename T, typename std::enable_if<std::is_same<T, uint8_t>::value, int>::type = 0>
diff --git a/tests/validation/reference/Tile.cpp b/tests/validation/reference/Tile.cpp
index e87e515..694f645 100644
--- a/tests/validation/reference/Tile.cpp
+++ b/tests/validation/reference/Tile.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -43,7 +43,8 @@
 
     SimpleTensor<T> dst{ tiled_shape, src.data_type() };
 
-    for(int idx = 0; idx < dst.num_elements(); idx++)
+    const uint32_t num_elements = dst.num_elements();
+    for(uint32_t idx = 0; idx < num_elements; idx++)
     {
         Coordinates coord = index2coord(tiled_shape, idx);
 
diff --git a/tests/validation/reference/Transpose.cpp b/tests/validation/reference/Transpose.cpp
index 348c703..a8c0e95 100644
--- a/tests/validation/reference/Transpose.cpp
+++ b/tests/validation/reference/Transpose.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -44,7 +44,8 @@
     SimpleTensor<T> dst{ dst_shape, src.data_type() };
 
     // Compute reference
-    for(int i = 0; i < src.num_elements(); ++i)
+    const uint32_t num_elements = src.num_elements();
+    for(uint32_t i = 0; i < num_elements; ++i)
     {
         const Coordinates coord = index2coord(src.shape(), i);
         const Coordinates dst_coord{ coord.y(), coord.x() };
diff --git a/tests/validation/reference/UpsampleLayer.cpp b/tests/validation/reference/UpsampleLayer.cpp
index 8e36ee8..79d7267 100644
--- a/tests/validation/reference/UpsampleLayer.cpp
+++ b/tests/validation/reference/UpsampleLayer.cpp
@@ -93,7 +93,7 @@
     {
         SimpleTensor<float> src_tmp = convert_from_asymmetric(src);
         SimpleTensor<float> dst_tmp = upsample_function<float>(src_tmp, info, policy);
-        dst                         = convert_to_asymmetric(dst_tmp, src.quantization_info());
+        dst                         = convert_to_asymmetric<uint8_t>(dst_tmp, src.quantization_info());
     }
     else
     {
diff --git a/tests/validation/reference/WarpAffine.cpp b/tests/validation/reference/WarpAffine.cpp
index 7b903b7..2a7aeb7 100644
--- a/tests/validation/reference/WarpAffine.cpp
+++ b/tests/validation/reference/WarpAffine.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -63,7 +63,8 @@
     const int width  = src.shape().x();
     const int height = src.shape().y();
 
-    for(int element_idx = 0; element_idx < src.num_elements(); ++element_idx)
+    const uint32_t num_elements = src.num_elements();
+    for(uint32_t element_idx = 0; element_idx < num_elements; ++element_idx)
     {
         valid_mask[element_idx] = 1;
         Coordinates id          = index2coord(src.shape(), element_idx);
diff --git a/tests/validation/reference/WarpPerspective.cpp b/tests/validation/reference/WarpPerspective.cpp
index 7a50253..dc7420b 100644
--- a/tests/validation/reference/WarpPerspective.cpp
+++ b/tests/validation/reference/WarpPerspective.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -57,7 +57,8 @@
     const int width  = src.shape().x();
     const int height = src.shape().y();
 
-    for(int element_idx = 0; element_idx < src.num_elements(); ++element_idx)
+    const uint32_t num_elements = src.num_elements();
+    for(uint32_t element_idx = 0; element_idx < num_elements; ++element_idx)
     {
         valid_mask[element_idx] = 1;
         Coordinates id          = index2coord(src.shape(), element_idx);
diff --git a/tests/validation/reference/YOLOLayer.cpp b/tests/validation/reference/YOLOLayer.cpp
index a12f411..0011b85 100644
--- a/tests/validation/reference/YOLOLayer.cpp
+++ b/tests/validation/reference/YOLOLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -46,7 +46,8 @@
     const T a(info.a());
     const T b(info.b());
 
-    for(int i = 0; i < src.num_elements(); ++i)
+    const uint32_t num_elements = src.num_elements();
+    for(uint32_t i = 0; i < num_elements; ++i)
     {
         const size_t z = index2coord(dst.shape(), i).z() % (num_classes + 5);
 
@@ -68,7 +69,7 @@
 {
     SimpleTensor<float>   src_tmp = convert_from_asymmetric(src);
     SimpleTensor<float>   dst_tmp = yolo_layer<float>(src_tmp, info, num_classes);
-    SimpleTensor<uint8_t> dst     = convert_to_asymmetric(dst_tmp, src.quantization_info());
+    SimpleTensor<uint8_t> dst     = convert_to_asymmetric<uint8_t>(dst_tmp, src.quantization_info());
     return dst;
 }